diff --git a/features/esm.py b/features/esm.py
index 94bc73e..129e411 100644
--- a/features/esm.py
+++ b/features/esm.py
@@ -347,3 +347,75 @@ def increment_answers(df_esm_clean: pd.DataFrame, increment_by=1):
         print("Please, clean the dataframe first using features.esm.clean_up_esm.")
         print(e)
     return df_esm_clean
+
+
+def reassign_question_ids(
+    df_esm_cleaned: pd.DataFrame, question_ids_content: dict
+) -> pd.DataFrame:
+    """
+    Fix question IDs to match their actual content.
+
+    Unfortunately, when altering the protocol to adapt to COVID pandemic,
+    we did not retain original question IDs.
+    This means that for participants before 2021, they are different
+    from those for the rest of them.
+    This function searches for question IDs by matching their strings.
+
+    Parameters
+    ----------
+    df_esm_cleaned: pd.DataFrame
+        A cleaned up dataframe, which must also include esm_user_answer_numeric.
+    question_ids_content: dict
+        A dictionary, linking question IDs with their content ("instructions").
+        The content is a prefix (or a tuple of prefixes) of the instructions.
+
+    Returns
+    -------
+    df_esm_fixed: pd.DataFrame
+        The same dataframe but with fixed question IDs.
+        Rows whose instructions match no entry get None as their question ID.
+
+    Raises
+    ------
+    KeyError
+        If a question ID from the dictionary occurs in the data with
+        instructions that do not start with the expected content.
+    """
+    # Tabulate all distinct instructions occurring under each question ID.
+    df_esm_unique_questions = (
+        df_esm_cleaned.groupby("question_id")
+        .esm_instructions.value_counts()
+        .rename()
+        .reset_index()
+    )
+
+    # First, check that we anticipated all esm instructions.
+    for q_id, expected_content in question_ids_content.items():
+        # All questions ("instructions") occurring in the data under this ID.
+        actual_questions = df_esm_unique_questions.loc[
+            df_esm_unique_questions["question_id"] == q_id,
+            "esm_instructions",
+        ]
+        # Mark the ones that are expected, i.e. included in the dictionary.
+        questions_matches = actual_questions.str.startswith(expected_content)
+        # In case there is an unexpected question, raise an exception.
+        # The match mask must be tested here, not the question strings.
+        if not questions_matches.all():
+            print("One of the questions that occur in the data was undefined.")
+            print("This were the questions found in the data: ")
+            raise KeyError(actual_questions[~questions_matches])
+
+    # Next, replace question IDs.
+    df_esm_fixed = df_esm_cleaned.copy()
+    df_esm_fixed["question_id"] = df_esm_cleaned["esm_instructions"].apply(
+        lambda x: next(
+            (
+                key
+                for key, values in question_ids_content.items()
+                if x.startswith(values)
+            ),
+            None,
+        )
+    )
+
+    return df_esm_fixed
diff --git a/features/esm_COPE.py b/features/esm_COPE.py
index 5342f26..ab898de 100644
--- a/features/esm_COPE.py
+++ b/features/esm_COPE.py
@@ -1,5 +1,3 @@
-import pandas as pd
-
 COPE_ORIGINAL_MAX = 4
 COPE_ORIGINAL_MIN = 1
 
@@ -125,65 +123,3 @@ DICT_COPE_QUESTION_IDS = {
         "Razburil sem se in razmiĆĄljal samo o tem",
     ),
 }
-
-
-def reassign_question_ids(df_cope_cleaned: pd.DataFrame) -> pd.DataFrame:
-    """
-    Fix question IDs to match their actual content.
-
-    Unfortunately, when altering the protocol to adapt to COVID pandemic,
-    we did not retain original question IDs.
-    This means that for participants before 2021, they are different
-    from for the rest of them.
-    This function searches for question IDs by matching their strings.
-
-    Parameters
-    ----------
-    df_cope_cleaned: pd.DataFrame
-        A cleaned up dataframe, which must also include esm_user_answer_numeric.
-
-    Returns
-    -------
-    df_cope_fixed: pd.DataFrame
-        The same dataframe but with fixed question IDs.
-    """
-    df_esm_cope_unique_questions = (
-        df_cope_cleaned.groupby("question_id")
-        .esm_instructions.value_counts()
-        .rename()
-        .reset_index()
-    )
-    # Tabulate all possible answers to each question (group by question ID).
-
-    # First, check that we anticipated all esm instructions.
-    for q_id in DICT_COPE_QUESTION_IDS.keys():
-        # Look for all questions ("instructions") occurring in the dataframe.
-        actual_questions = df_esm_cope_unique_questions.loc[
-            df_esm_cope_unique_questions["question_id"] == q_id,
-            "esm_instructions",
-        ]
-        # These are all answers to a given question (by q_id).
-        questions_matches = actual_questions.str.startswith(
-            DICT_COPE_QUESTION_IDS.get(q_id)
-        )
-        # See if they are expected, i.e. included in the dictionary.
-        if ~actual_questions.all():
-            print("One of the questions that occur in the data was undefined.")
-            print("This were the questions found in the data: ")
-            raise KeyError(actual_questions[~questions_matches])
-        # In case there is an unexpected answer, raise an exception.
-
-    # Next, replace question IDs.
-    df_cope_fixed = df_cope_cleaned.copy()
-    df_cope_fixed["question_id"] = df_cope_cleaned["esm_instructions"].apply(
-        lambda x: next(
-            (
-                key
-                for key, values in DICT_COPE_QUESTION_IDS.items()
-                if x.startswith(values)
-            ),
-            None,
-        )
-    )
-
-    return df_cope_fixed
diff --git a/features/esm_SAM.py b/features/esm_SAM.py
index c8cc492..edf5a28 100644
--- a/features/esm_SAM.py
+++ b/features/esm_SAM.py
@@ -444,65 +444,3 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
 
     # TODO: How many questions about the stressfulness of the period were asked
     # and how does this relate to events?
-
-
-def reassign_question_ids(df_sam_cleaned: pd.DataFrame) -> pd.DataFrame:
-    """
-    Fix question IDs to match their actual content.
-
-    Unfortunately, when altering the protocol to adapt to COVID pandemic,
-    we did not retain original question IDs.
-    This means that for participants before 2021, they are different
-    from for the rest of them.
-    This function searches for question IDs by matching their strings.
-
-    Parameters
-    ----------
-    df_sam_cleaned: pd.DataFrame
-        A cleaned up dataframe, which must also include esm_user_answer_numeric.
-
-    Returns
-    -------
-    df_sam_fixed: pd.DataFrame
-        The same dataframe but with fixed question IDs.
-    """
-    df_esm_sam_unique_questions = (
-        df_sam_cleaned.groupby("question_id")
-        .esm_instructions.value_counts()
-        .rename()
-        .reset_index()
-    )
-    # Tabulate all possible answers to each question (group by question ID).
-
-    # First, check that we anticipated all esm instructions.
-    for q_id in DICT_SAM_QUESTION_IDS.keys():
-        # Look for all questions ("instructions") occurring in the dataframe.
-        actual_questions = df_esm_sam_unique_questions.loc[
-            df_esm_sam_unique_questions["question_id"] == q_id,
-            "esm_instructions",
-        ]
-        # These are all answers to a given question (by q_id).
-        questions_matches = actual_questions.str.startswith(
-            DICT_SAM_QUESTION_IDS.get(q_id)
-        )
-        # See if they are expected, i.e. included in the dictionary.
-        if ~actual_questions.all():
-            print("One of the questions that occur in the data was undefined.")
-            print("This were the questions found in the data: ")
-            raise KeyError(actual_questions[~questions_matches])
-        # In case there is an unexpected answer, raise an exception.
-
-    # Next, replace question IDs.
-    df_sam_fixed = df_sam_cleaned.copy()
-    df_sam_fixed["question_id"] = df_sam_cleaned["esm_instructions"].apply(
-        lambda x: next(
-            (
-                key
-                for key, values in DICT_SAM_QUESTION_IDS.items()
-                if x.startswith(values)
-            ),
-            None,
-        )
-    )
-
-    return df_sam_fixed