From e0da6757ecadc891504865efc5d3f56e9e920e75 Mon Sep 17 00:00:00 2001 From: junos Date: Sat, 3 Jul 2021 16:34:11 +0200 Subject: [PATCH] Clean up ESM by eliminating non-answers. Convert radio string answer to numeric. --- exploration/expl_esm_labels.py | 21 ++++++++++----------- features/esm.py | 20 +++++++++++++++++--- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/exploration/expl_esm_labels.py b/exploration/expl_esm_labels.py index beefd0e..01107c3 100644 --- a/exploration/expl_esm_labels.py +++ b/exploration/expl_esm_labels.py @@ -25,12 +25,6 @@ if nb_dir not in sys.path: import participants.query_db from features.esm import * -# %% [markdown] -# # ESM data - -# %% [markdown] -# Only take data from the main part of the study. The pilot data have different structure, there were especially many additions to ESM_JSON. - # %% participants_inactive_usernames = participants.query_db.get_usernames( collection_start=datetime.date.fromisoformat("2020-08-01") @@ -39,15 +33,20 @@ df_esm_inactive = get_esm_data(participants_inactive_usernames) # %% df_esm_preprocessed = preprocess_esm(df_esm_inactive) -df_esm_clean = clean_up_esm(df_esm_preprocessed) + +# %% [markdown] +# # PANAS # %% -df_esm_PANAS = df_esm_clean[ - (df_esm_clean["questionnaire_id"] == 8) | (df_esm_clean["questionnaire_id"] == 9) +df_esm_PANAS = df_esm_preprocessed[ + (df_esm_preprocessed["questionnaire_id"] == 8) + | (df_esm_preprocessed["questionnaire_id"] == 9) ] -df_esm_PANAS_grouped = df_esm_PANAS.groupby(["participant_id", "questionnaire_id"]) +df_esm_PANAS_clean = clean_up_esm(df_esm_PANAS) # %% -df_esm_PANAS.head() +df_esm_PANAS_grouped = df_esm_PANAS_clean.groupby( + ["participant_id", "questionnaire_id"] +) # %% diff --git a/features/esm.py b/features/esm.py index 3d9e878..e1cbf8b 100644 --- a/features/esm.py +++ b/features/esm.py @@ -17,6 +17,10 @@ SESSION_STATUS_UNANSWERED = "ema_unanswered" SESSION_STATUS_DAY_FINISHED = "day_finished" SESSION_STATUS_COMPLETE = "ema_completed" +ANSWER_DAY_FINISHED = "DayFinished3421" +ANSWER_DAY_OFF = "DayOff3421" +ANSWER_SET_EVENING = "DayFinishedSetEvening" + MAX_MORNING_LENGTH = 3 # When the participants was not yet at work at the time of the first (morning) EMA, # only three items were answered. @@ -119,10 +123,10 @@ def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.Dat # 2. Identify non-sessions, i.e. answers about the end of the day. non_session = sessions_grouped.apply( lambda x: ( - (x.esm_user_answer == "DayFinished3421") # I finished working for today. - | (x.esm_user_answer == "DayOff3421") # I am not going to work today. + (x.esm_user_answer == ANSWER_DAY_FINISHED) # I finished working for today. + | (x.esm_user_answer == ANSWER_DAY_OFF) # I am not going to work today. | ( - x.esm_user_answer == "DayFinishedSetEvening" + x.esm_user_answer == ANSWER_SET_EVENING ) # When would you like to answer the evening EMA? ).any() ) @@ -234,4 +238,14 @@ def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame: df_esm_clean = df_esm_preprocessed[ df_esm_preprocessed["esm_status"] == ESM_STATUS_ANSWERED ] + df_esm_clean = df_esm_clean[ + ~df_esm_clean["esm_user_answer"].isin( + [ANSWER_DAY_FINISHED, ANSWER_DAY_OFF, ANSWER_SET_EVENING] + ) + ] + df_esm_clean = df_esm_clean.assign( + esm_user_answer_numeric=lambda x: x.esm_user_answer.str.slice(stop=1).astype( + int + ) + ) return df_esm_clean