# Extracted from the patch "Add a function to fix SAM question IDs."
# (commit 825380a47e2951d3ead6a9a675259ecf5ec4c1aa, author junos, 2023-07-03),
# which adds the definitions below to features/esm_SAM.py.
#
# NOTE: DICT_SAM_QUESTION_IDS is referenced by reassign_question_ids but is
# not part of the patch; it is expected to be defined elsewhere in
# features/esm_SAM.py.

import pandas as pd

# Bounds of the traditional SAM answer scale (see the scoring note in
# reassign_question_ids).
SAM_ORIGINAL_MAX = 5
SAM_ORIGINAL_MIN = 1


def reassign_question_ids(df_sam_cleaned: pd.DataFrame) -> pd.DataFrame:
    """Reassign SAM question IDs from the question wording and add a score.

    The function works in three steps:

    1. Validate that, for every question ID listed in
       ``DICT_SAM_QUESTION_IDS``, all instructions recorded under that ID in
       the data start with the text expected for it.
    2. Overwrite ``question_id`` with the key of ``DICT_SAM_QUESTION_IDS``
       whose expected text the row's ``esm_instructions`` starts with.
       Rows whose instruction matches no entry (or is missing) get ``None``.
    3. Add ``esm_user_score`` as ``esm_user_answer_numeric + 1``.

    Parameters
    ----------
    df_sam_cleaned
        Dataframe with at least the columns ``question_id`` and
        ``esm_instructions``. The column ``esm_user_answer_numeric`` is
        needed for the score; presumably it is produced by
        ``features.esm.clean_up_esm`` (as the printed hint says).

    Returns
    -------
    pd.DataFrame
        A copy of the input with ``question_id`` reassigned and, when
        ``esm_user_answer_numeric`` is available, an added
        ``esm_user_score`` column. The input dataframe is not modified.

    Raises
    ------
    KeyError
        If the data contain an instruction, recorded under a known question
        ID, that does not start with the text expected for that ID.
    """
    # Tabulate every distinct (question ID, instruction text) pair that
    # occurs in the data; rows with either value missing are ignored.
    observed_questions = (
        df_sam_cleaned[["question_id", "esm_instructions"]]
        .dropna()
        .drop_duplicates()
    )

    # First, check that we anticipated all ESM instructions.
    for q_id, expected_prefixes in DICT_SAM_QUESTION_IDS.items():
        # All question texts ("instructions") recorded under this ID.
        actual_questions = observed_questions.loc[
            observed_questions["question_id"] == q_id,
            "esm_instructions",
        ]
        if actual_questions.empty:
            continue
        # na=False: a non-string instruction counts as a mismatch instead of
        # producing a missing value that cannot be negated below.
        questions_matches = actual_questions.str.startswith(
            expected_prefixes, na=False
        )
        # The check must look at the match mask, not at the raw strings,
        # and must use logical (not bitwise) negation on the scalar result.
        if not questions_matches.all():
            unexpected = actual_questions[~questions_matches].tolist()
            raise KeyError(
                "One of the questions that occur in the data was undefined. "
                f"Unexpected questions found for question ID {q_id}: "
                f"{unexpected}"
            )

    def _match_question_id(instruction):
        """Return the first dictionary key whose text starts *instruction*."""
        if not isinstance(instruction, str):
            # Missing instructions cannot be matched to any question.
            return None
        return next(
            (
                key
                for key, prefixes in DICT_SAM_QUESTION_IDS.items()
                if instruction.startswith(prefixes)
            ),
            None,
        )

    # Next, replace question IDs.
    df_sam_fixed = df_sam_cleaned.copy()
    df_sam_fixed["question_id"] = df_sam_cleaned["esm_instructions"].apply(
        _match_question_id
    )

    # Finally, increment numeric answers by 1 to keep in line with
    # traditional scoring (from SAM_ORIGINAL_MIN to SAM_ORIGINAL_MAX).
    # NOTE(review): this presumes the raw answers are zero-based - confirm.
    if "esm_user_answer_numeric" in df_sam_fixed.columns:
        df_sam_fixed = df_sam_fixed.assign(
            esm_user_score=df_sam_fixed["esm_user_answer_numeric"] + 1
        )
    else:
        # Best effort, as before: report the problem and return the
        # dataframe without the score column.
        print("Please, clean the dataframe first using features.esm.clean_up_esm.")
        print("The column 'esm_user_answer_numeric' is missing.")

    return df_sam_fixed