Extract method to reuse.

master
junos 2023-07-03 21:13:50 +02:00
parent c688580fe8
commit 82b53bc0d3
3 changed files with 66 additions and 126 deletions

View File

@ -347,3 +347,69 @@ def increment_answers(df_esm_clean: pd.DataFrame, increment_by=1):
print("Please, clean the dataframe first using features.esm.clean_up_esm.")
print(e)
return df_esm_clean
def reassign_question_ids(
    df_esm_cleaned: pd.DataFrame, question_ids_content: dict
) -> pd.DataFrame:
    """
    Fix question IDs to match their actual content.

    Unfortunately, when altering the protocol to adapt to the COVID pandemic,
    we did not retain the original question IDs.
    This means that for participants before 2021, they are different
    from those of the rest of the participants.
    This function reassigns question IDs by matching the question text
    ("instructions") against the expected beginning of each question.

    Parameters
    ----------
    df_esm_cleaned: pd.DataFrame
        A cleaned up dataframe. Only the columns question_id and
        esm_instructions are read here.
    question_ids_content: dict
        A dictionary, linking question IDs with their content ("instructions").
        Each value is handed to str.startswith, so it is either a single
        prefix or a tuple of alternative prefixes.

    Returns
    -------
    df_esm_fixed: pd.DataFrame
        A copy of the dataframe in which question_id is replaced by the first
        key of question_ids_content whose content matches esm_instructions.
        Rows that match no key get the lookup's default, None.

    Raises
    ------
    KeyError
        If a question ID listed in question_ids_content occurs in the data
        together with instructions that do not start with the expected content.
        The offending instructions are passed as the exception argument.
    """
    # Tabulate the distinct instructions seen under each question ID.
    # The counts themselves are not used; the bare rename() only clears the
    # Series name so that reset_index() cannot clash with the index levels.
    df_esm_unique_questions = (
        df_esm_cleaned.groupby("question_id")
        .esm_instructions.value_counts()
        .rename()
        .reset_index()
    )

    # First, check that we anticipated all esm instructions.
    for q_id, expected_content in question_ids_content.items():
        # All questions ("instructions") occurring in the data under this ID.
        actual_questions = df_esm_unique_questions.loc[
            df_esm_unique_questions["question_id"] == q_id,
            "esm_instructions",
        ]
        # Boolean mask: does each of them start with the expected content?
        questions_matches = actual_questions.str.startswith(expected_content)
        # The mask (not the question strings themselves) decides whether
        # something unexpected was found. An ID absent from the data yields
        # an empty mask, for which all() is True, so it passes.
        if not questions_matches.all():
            print("One of the questions that occur in the data was undefined.")
            print("This were the questions found in the data: ")
            raise KeyError(actual_questions[~questions_matches])

    # Next, replace question IDs by looking each instruction up by its prefix.
    df_esm_fixed = df_esm_cleaned.copy()
    df_esm_fixed["question_id"] = df_esm_cleaned["esm_instructions"].apply(
        lambda x: next(
            (
                key
                for key, values in question_ids_content.items()
                if x.startswith(values)
            ),
            None,
        )
    )

    return df_esm_fixed

View File

@ -1,5 +1,3 @@
import pandas as pd
# Upper and lower bound of a COPE scale; presumably the answer range of the
# original questionnaire items (1 to 4) - TODO confirm against their usage.
COPE_ORIGINAL_MAX = 4
COPE_ORIGINAL_MIN = 1
@ -125,65 +123,3 @@ DICT_COPE_QUESTION_IDS = {
"Razburil sem se in razmišljal samo o tem",
),
}
def reassign_question_ids(df_cope_cleaned: pd.DataFrame) -> pd.DataFrame:
    """
    Fix question IDs to match their actual content.

    Unfortunately, when altering the protocol to adapt to the COVID pandemic,
    we did not retain the original question IDs.
    This means that for participants before 2021, they are different
    from those of the rest of the participants.
    This function reassigns question IDs by matching the question text
    ("instructions") against the contents of DICT_COPE_QUESTION_IDS.

    Parameters
    ----------
    df_cope_cleaned: pd.DataFrame
        A cleaned up dataframe. Only the columns question_id and
        esm_instructions are read here.

    Returns
    -------
    df_cope_fixed: pd.DataFrame
        A copy of the dataframe in which question_id is replaced by the first
        key of DICT_COPE_QUESTION_IDS whose content matches esm_instructions.
        Rows that match no key get the lookup's default, None.

    Raises
    ------
    KeyError
        If a question ID listed in DICT_COPE_QUESTION_IDS occurs in the data
        together with instructions that do not start with the expected content.
        The offending instructions are passed as the exception argument.
    """
    # Tabulate the distinct instructions seen under each question ID.
    # The counts themselves are not used; the bare rename() only clears the
    # Series name so that reset_index() cannot clash with the index levels.
    df_esm_cope_unique_questions = (
        df_cope_cleaned.groupby("question_id")
        .esm_instructions.value_counts()
        .rename()
        .reset_index()
    )

    # First, check that we anticipated all esm instructions.
    for q_id, expected_content in DICT_COPE_QUESTION_IDS.items():
        # All questions ("instructions") occurring in the data under this ID.
        actual_questions = df_esm_cope_unique_questions.loc[
            df_esm_cope_unique_questions["question_id"] == q_id,
            "esm_instructions",
        ]
        # Boolean mask: does each of them start with the expected content?
        questions_matches = actual_questions.str.startswith(expected_content)
        # The mask (not the question strings themselves) decides whether
        # something unexpected was found.
        if not questions_matches.all():
            print("One of the questions that occur in the data was undefined.")
            print("This were the questions found in the data: ")
            raise KeyError(actual_questions[~questions_matches])

    # Next, replace question IDs by looking each instruction up by its prefix.
    df_cope_fixed = df_cope_cleaned.copy()
    df_cope_fixed["question_id"] = df_cope_cleaned["esm_instructions"].apply(
        lambda x: next(
            (
                key
                for key, values in DICT_COPE_QUESTION_IDS.items()
                if x.startswith(values)
            ),
            None,
        )
    )

    return df_cope_fixed

View File

@ -444,65 +444,3 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
# TODO: How many questions about the stressfulness of the period were asked
# and how does this relate to events?
def reassign_question_ids(df_sam_cleaned: pd.DataFrame) -> pd.DataFrame:
    """
    Fix question IDs to match their actual content.

    Unfortunately, when altering the protocol to adapt to the COVID pandemic,
    we did not retain the original question IDs.
    This means that for participants before 2021, they are different
    from those of the rest of the participants.
    This function reassigns question IDs by matching the question text
    ("instructions") against the contents of DICT_SAM_QUESTION_IDS.

    Parameters
    ----------
    df_sam_cleaned: pd.DataFrame
        A cleaned up dataframe. Only the columns question_id and
        esm_instructions are read here.

    Returns
    -------
    df_sam_fixed: pd.DataFrame
        A copy of the dataframe in which question_id is replaced by the first
        key of DICT_SAM_QUESTION_IDS whose content matches esm_instructions.
        Rows that match no key get the lookup's default, None.

    Raises
    ------
    KeyError
        If a question ID listed in DICT_SAM_QUESTION_IDS occurs in the data
        together with instructions that do not start with the expected content.
        The offending instructions are passed as the exception argument.
    """
    # Tabulate the distinct instructions seen under each question ID.
    # The counts themselves are not used; the bare rename() only clears the
    # Series name so that reset_index() cannot clash with the index levels.
    df_esm_sam_unique_questions = (
        df_sam_cleaned.groupby("question_id")
        .esm_instructions.value_counts()
        .rename()
        .reset_index()
    )

    # First, check that we anticipated all esm instructions.
    for q_id, expected_content in DICT_SAM_QUESTION_IDS.items():
        # All questions ("instructions") occurring in the data under this ID.
        actual_questions = df_esm_sam_unique_questions.loc[
            df_esm_sam_unique_questions["question_id"] == q_id,
            "esm_instructions",
        ]
        # Boolean mask: does each of them start with the expected content?
        questions_matches = actual_questions.str.startswith(expected_content)
        # The mask (not the question strings themselves) decides whether
        # something unexpected was found.
        if not questions_matches.all():
            print("One of the questions that occur in the data was undefined.")
            print("This were the questions found in the data: ")
            raise KeyError(actual_questions[~questions_matches])

    # Next, replace question IDs by looking each instruction up by its prefix.
    df_sam_fixed = df_sam_cleaned.copy()
    df_sam_fixed["question_id"] = df_sam_cleaned["esm_instructions"].apply(
        lambda x: next(
            (
                key
                for key, values in DICT_SAM_QUESTION_IDS.items()
                if x.startswith(values)
            ),
            None,
        )
    )

    return df_sam_fixed