Extract method to reuse.

2023-07-03 21:13:50 +02:00 · 2023-07-03 21:13:50 +02:00 · 82b53bc0d3
parent c688580fe8
commit 82b53bc0d3
3 changed files with 66 additions and 126 deletions
--- a/features/esm.py
+++ b/features/esm.py
@ -347,3 +347,69 @@ def increment_answers(df_esm_clean: pd.DataFrame, increment_by=1):
        print("Please, clean the dataframe first using features.esm.clean_up_esm.")
        print(e)
    return df_esm_clean
+
+
+def reassign_question_ids(
+    df_esm_cleaned: pd.DataFrame, question_ids_content: dict
+) -> pd.DataFrame:
+    """
+    Fix question IDs to match their actual content.
+
+    Unfortunately, when altering the protocol to adapt to COVID pandemic,
+    we did not retain original question IDs.
+    This means that the IDs recorded for participants before 2021 differ
+    from the ones recorded for the rest of them.
+    This function assigns question IDs by matching the question strings.
+
+    Parameters
+    ----------
+    df_esm_cleaned: pd.DataFrame
+        A cleaned up dataframe, which must include the esm_instructions column.
+    question_ids_content: dict
+        A dictionary, linking question IDs with their content ("instructions").
+        Each value is a string or a tuple of strings; an instruction belongs
+        to a question ID if it starts with (one of) these strings.
+
+    Returns
+    -------
+    df_esm_fixed: pd.DataFrame
+        A copy of the dataframe with question_id reassigned by content.
+        The input dataframe is not modified.
+
+    Raises
+    ------
+    KeyError
+        If an instruction occurring in the data matches none of the entries
+        in question_ids_content. The exception carries the unmatched
+        instructions as a pd.Series.
+    """
+
+    def find_question_id(instructions: str):
+        # The first dictionary entry whose content is a prefix of the text wins.
+        return next(
+            (
+                q_id
+                for q_id, content in question_ids_content.items()
+                if instructions.startswith(content)
+            ),
+            None,
+        )
+
+    # Resolve every distinct instruction text only once, not once per row.
+    instructions_to_id = {
+        instructions: find_question_id(instructions)
+        for instructions in df_esm_cleaned["esm_instructions"].unique()
+    }
+
+    # First, check that we anticipated all esm instructions.
+    # The recorded question IDs are exactly what cannot be trusted here,
+    # so each instruction is validated against all dictionary entries
+    # rather than only against the entry of the ID it was recorded under.
+    undefined_questions = [
+        instructions
+        for instructions, q_id in instructions_to_id.items()
+        if q_id is None
+    ]
+    if undefined_questions:
+        print("One of the questions that occur in the data was undefined.")
+        print("This were the questions found in the data: ")
+        # In case there is an unexpected question, raise an exception.
+        raise KeyError(pd.Series(undefined_questions, name="esm_instructions"))
+
+    # Next, replace question IDs.
+    df_esm_fixed = df_esm_cleaned.copy()
+    df_esm_fixed["question_id"] = df_esm_cleaned["esm_instructions"].map(
+        instructions_to_id
+    )
+
+    return df_esm_fixed
--- a/features/esm_COPE.py
+++ b/features/esm_COPE.py
@ -1,5 +1,3 @@
-import pandas as pd
-
 COPE_ORIGINAL_MAX = 4
 COPE_ORIGINAL_MIN = 1

@ -125,65 +123,3 @@ DICT_COPE_QUESTION_IDS = {
        "Razburil sem se in razmišljal samo o tem",
    ),
 }
-
-
-def reassign_question_ids(df_cope_cleaned: pd.DataFrame) -> pd.DataFrame:
-    """
-    Fix question IDs to match their actual content.
-
-    Unfortunately, when altering the protocol to adapt to COVID pandemic,
-    we did not retain original question IDs.
-    This means that for participants before 2021, they are different
-    from for the rest of them.
-    This function searches for question IDs by matching their strings.
-
-    Parameters
-    ----------
-    df_cope_cleaned: pd.DataFrame
-        A cleaned up dataframe, which must also include esm_user_answer_numeric.
-
-    Returns
-    -------
-    df_cope_fixed: pd.DataFrame
-        The same dataframe but with fixed question IDs.
-    """
-    df_esm_cope_unique_questions = (
-        df_cope_cleaned.groupby("question_id")
-        .esm_instructions.value_counts()
-        .rename()
-        .reset_index()
-    )
-    # Tabulate all possible answers to each question (group by question ID).
-
-    # First, check that we anticipated all esm instructions.
-    for q_id in DICT_COPE_QUESTION_IDS.keys():
-        # Look for all questions ("instructions") occurring in the dataframe.
-        actual_questions = df_esm_cope_unique_questions.loc[
-            df_esm_cope_unique_questions["question_id"] == q_id,
-            "esm_instructions",
-        ]
-        # These are all answers to a given question (by q_id).
-        questions_matches = actual_questions.str.startswith(
-            DICT_COPE_QUESTION_IDS.get(q_id)
-        )
-        # See if they are expected, i.e. included in the dictionary.
-        if ~actual_questions.all():
-            print("One of the questions that occur in the data was undefined.")
-            print("This were the questions found in the data: ")
-            raise KeyError(actual_questions[~questions_matches])
-            # In case there is an unexpected answer, raise an exception.
-
-    # Next, replace question IDs.
-    df_cope_fixed = df_cope_cleaned.copy()
-    df_cope_fixed["question_id"] = df_cope_cleaned["esm_instructions"].apply(
-        lambda x: next(
-            (
-                key
-                for key, values in DICT_COPE_QUESTION_IDS.items()
-                if x.startswith(values)
-            ),
-            None,
-        )
-    )
-
-    return df_cope_fixed
--- a/features/esm_SAM.py
+++ b/features/esm_SAM.py
@ -444,65 +444,3 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:

 # TODO: How many questions about the stressfulness of the period were asked
 #  and how does this relate to events?
-
-
-def reassign_question_ids(df_sam_cleaned: pd.DataFrame) -> pd.DataFrame:
-    """
-    Fix question IDs to match their actual content.
-
-    Unfortunately, when altering the protocol to adapt to COVID pandemic,
-    we did not retain original question IDs.
-    This means that for participants before 2021, they are different
-    from for the rest of them.
-    This function searches for question IDs by matching their strings.
-
-    Parameters
-    ----------
-    df_sam_cleaned: pd.DataFrame
-        A cleaned up dataframe, which must also include esm_user_answer_numeric.
-
-    Returns
-    -------
-    df_sam_fixed: pd.DataFrame
-        The same dataframe but with fixed question IDs.
-    """
-    df_esm_sam_unique_questions = (
-        df_sam_cleaned.groupby("question_id")
-        .esm_instructions.value_counts()
-        .rename()
-        .reset_index()
-    )
-    # Tabulate all possible answers to each question (group by question ID).
-
-    # First, check that we anticipated all esm instructions.
-    for q_id in DICT_SAM_QUESTION_IDS.keys():
-        # Look for all questions ("instructions") occurring in the dataframe.
-        actual_questions = df_esm_sam_unique_questions.loc[
-            df_esm_sam_unique_questions["question_id"] == q_id,
-            "esm_instructions",
-        ]
-        # These are all answers to a given question (by q_id).
-        questions_matches = actual_questions.str.startswith(
-            DICT_SAM_QUESTION_IDS.get(q_id)
-        )
-        # See if they are expected, i.e. included in the dictionary.
-        if ~actual_questions.all():
-            print("One of the questions that occur in the data was undefined.")
-            print("This were the questions found in the data: ")
-            raise KeyError(actual_questions[~questions_matches])
-            # In case there is an unexpected answer, raise an exception.
-
-    # Next, replace question IDs.
-    df_sam_fixed = df_sam_cleaned.copy()
-    df_sam_fixed["question_id"] = df_sam_cleaned["esm_instructions"].apply(
-        lambda x: next(
-            (
-                key
-                for key, values in DICT_SAM_QUESTION_IDS.items()
-                if x.startswith(values)
-            ),
-            None,
-        )
-    )
-
-    return df_sam_fixed