# NOTE: reconstructed from a whitespace-collapsed patch to features/esm.py.
# `get_esm_data` and `classify_sessions_by_time` are only partially visible in
# that patch (and `classify_sessions_by_completion` not at all), so they are
# not reproduced here; they must be available in the same module.

SESSION_STATUS_UNANSWERED = "ema_unanswered"
SESSION_STATUS_DAY_FINISHED = "day_finished"
SESSION_STATUS_COMPLETE = "ema_completed"

# When a participant was not yet at work at the time of the first (morning)
# EMA, only three items were answered: two sleep-related items and one
# indicating that work had NOT started yet.
# Daytime EMAs are all longer; in fact, they always consist of at least 6 items.
MAX_MORNING_LENGTH = 3


def classify_sessions_by_completion_time(
    df_esm_preprocessed: pd.DataFrame,
) -> pd.DataFrame:
    """
    Classify EMA sessions by completion and by time, correcting the time label.

    This function combines the results of classify_sessions_by_completion and
    classify_sessions_by_time, and additionally "corrects" the time type of
    some EMA sessions.

    A morning questionnaire could seamlessly transition into a daytime
    questionnaire if the participant was already at work.
    In this case, the "time" label changed mid-session.
    Because of the way classify_sessions_by_time works, such a questionnaire
    is classified as "morning", but for all intents and purposes it can be
    treated as a "daytime" EMA.

    This scenario is differentiated from a true "morning" questionnaire, where
    the participant was NOT yet at work, by the session length: a session
    labelled "morning" with more than MAX_MORNING_LENGTH items is relabelled
    as "daytime". Sessions with any other time label are left untouched.

    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
        A preprocessed dataframe of esm data, which must include the session ID (esm_session).

    Returns
    -------
    df_session_counts_time: pd.DataFrame
        The result of classify_sessions_by_time joined (on the index) with the
        result of classify_sessions_by_completion, with the "time" column
        corrected as described above. It contains at least the columns "time"
        and "esm_session_count"; the remaining columns are whatever the two
        helper functions return.
    """
    df_session_counts = classify_sessions_by_completion(df_esm_preprocessed)
    df_session_time = classify_sessions_by_time(df_esm_preprocessed)

    # DataFrame.join aligns on the index and returns a new frame, so the
    # assignment below does not modify either helper's result.
    df_session_counts_time = df_session_time.join(df_session_counts)

    # Only sessions that were classified as "morning" may be relabelled.
    # Without the first condition, every long session would be overwritten
    # with "daytime" regardless of its original label.
    morning_transition_to_daytime = (df_session_counts_time["time"] == "morning") & (
        df_session_counts_time["esm_session_count"] > MAX_MORNING_LENGTH
    )
    df_session_counts_time.loc[morning_transition_to_daytime, "time"] = "daytime"

    return df_session_counts_time