Add a function to fix SAM question IDs.

2023-07-03 20:41:48 +02:00 · 2023-07-03 20:41:48 +02:00 · 825380a47e
parent ef26772038
commit 825380a47e
1 changed files with 58 additions and 0 deletions
--- a/features/esm_SAM.py
+++ b/features/esm_SAM.py
@ -3,6 +3,9 @@ import pandas as pd

 import features.esm

+SAM_ORIGINAL_MAX = 5  # presumably the top of the traditional SAM answer scale
+SAM_ORIGINAL_MIN = 1  # presumably the bottom of the traditional SAM answer scale
+
 QUESTIONNAIRE_ID_SAM = {
    "event_stress": 87,
    "event_threat": 88,
@ -441,3 +444,58 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:

 # TODO: How many questions about the stressfulness of the period were asked
 #  and how does this relate to events?
+
+
+def reassign_question_ids(df_sam_cleaned: pd.DataFrame) -> pd.DataFrame:
+    df_esm_sam_unique_questions = (
+        df_sam_cleaned.groupby("question_id")
+        .esm_instructions.value_counts()
+        .rename()
+        .reset_index()
+    )
+    # Tabulate all possible answers to each question (group by question ID).
+
+    # First, check that we anticipated all esm instructions.
+    for q_id in DICT_SAM_QUESTION_IDS.keys():
+        # Look for all questions ("instructions") occurring in the dataframe.
+        actual_questions = df_esm_sam_unique_questions.loc[
+            df_esm_sam_unique_questions["question_id"] == q_id,
+            "esm_instructions",
+        ]
+        # These are all answers to a given question (by q_id).
+        questions_matches = actual_questions.str.startswith(
+            DICT_SAM_QUESTION_IDS.get(q_id)
+        )
+        # See if they are expected, i.e. included in the dictionary.
+        if ~actual_questions.all():
+            print("One of the questions that occur in the data was undefined.")
+            print("This were the questions found in the data: ")
+            raise KeyError(actual_questions[~questions_matches])
+            # In case there is an unexpected answer, raise an exception.
+
+    # Next, replace question IDs.
+    df_sam_fixed = df_sam_cleaned.copy()
+    df_sam_fixed["question_id"] = df_sam_cleaned["esm_instructions"].apply(
+        lambda x: next(
+            (
+                key
+                for key, values in DICT_SAM_QUESTION_IDS.items()
+                if x.startswith(values)
+            ),
+            None,
+        )
+    )
+
+    # Finally, increment numeric answers.
+    try:
+        df_sam_fixed = df_sam_fixed.assign(
+            esm_user_score=lambda x: x.esm_user_answer_numeric + 1
+        )
+        # Increment the original answer by 1
+        # to keep in line with traditional scoring
+        # (from SAM_ORIGINAL_MIN - SAM_ORIGINAL_MAX).
+    except AttributeError as e:
+        print("Please, clean the dataframe first using features.esm.clean_up_esm.")
+        print(e)
+
+    return df_sam_fixed