Compare commits


No commits in common. "577a874288e84f5b57c4ae79b89712c67b1285ef" and "1aaf95fe9e3ca9d9bc578d847f1914329f5a27f8" have entirely different histories.

13 changed files with 47 additions and 838 deletions

View File

@@ -16,7 +16,6 @@ dependencies:
   - python-dotenv
   - pytz
   - seaborn
-  - scikit-learn
   - sqlalchemy
   - statsmodels
   - tabulate

View File

@@ -1,150 +0,0 @@
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.11.4
# kernelspec:
# display_name: straw2analysis
# language: python
# name: straw2analysis
# ---
# %%
# %matplotlib inline
import datetime
import os
import sys
import seaborn as sns
from sklearn import linear_model
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
sys.path.append(nb_dir)
# %%
import participants.query_db
from features import esm, helper, proximity
# %% [markdown]
# # 1. Get the relevant data
# %%
participants_inactive_usernames = participants.query_db.get_usernames(
collection_start=datetime.date.fromisoformat("2020-08-01")
)
# Consider only two participants to simplify.
ptcp_2 = participants_inactive_usernames[0:2]
# %% [markdown]
# ## 1.1 Labels
# %%
df_esm = esm.get_esm_data(ptcp_2)
df_esm_preprocessed = esm.preprocess_esm(df_esm)
# %%
df_esm_PANAS = df_esm_preprocessed[
(df_esm_preprocessed["questionnaire_id"] == 8)
| (df_esm_preprocessed["questionnaire_id"] == 9)
]
df_esm_PANAS_clean = esm.clean_up_esm(df_esm_PANAS)
# %% [markdown]
# ## 1.2 Sensor data
# %%
df_proximity = proximity.get_proximity_data(ptcp_2)
df_proximity = helper.get_date_from_timestamp(df_proximity)
df_proximity = proximity.recode_proximity(df_proximity)
# %% [markdown]
# ## 1.3 Standardization/personalization
# %% [markdown]
# # 2. Grouping/segmentation
# %%
df_esm_PANAS_daily_means = (
df_esm_PANAS_clean.groupby(["participant_id", "date_lj", "questionnaire_id"])
.esm_user_answer_numeric.agg("mean")
.reset_index()
.rename(columns={"esm_user_answer_numeric": "esm_numeric_mean"})
)
# %%
df_esm_PANAS_daily_means = (
df_esm_PANAS_daily_means.pivot(
index=["participant_id", "date_lj"],
columns="questionnaire_id",
values="esm_numeric_mean",
)
.reset_index(col_level=1)
.rename(columns={8.0: "PA", 9.0: "NA"})
.set_index(["participant_id", "date_lj"])
)
# %%
df_proximity_daily_counts = proximity.count_proximity(
df_proximity, ["participant_id", "date_lj"]
)
# %%
df_proximity_daily_counts
# %% [markdown]
# # 3. Join features (and export to csv?)
# %%
df_full_data_daily_means = df_esm_PANAS_daily_means.join(
df_proximity_daily_counts
).reset_index()
# %% [markdown]
# # 4. Machine learning model and parameters
# %%
lin_reg_proximity = linear_model.LinearRegression()
# %% [markdown]
# ## 4.1 Validation method
# %%
logo = LeaveOneGroupOut()
logo.get_n_splits(
df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
df_full_data_daily_means["PA"],
groups=df_full_data_daily_means["participant_id"],
)
# %% [markdown]
# ## 4.2 Fit results (export?)
# %%
cross_val_score(
lin_reg_proximity,
df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
df_full_data_daily_means["PA"],
groups=df_full_data_daily_means["participant_id"],
cv=logo,
n_jobs=-1,
scoring="r2",
)
# %%
lin_reg_proximity.fit(
df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
df_full_data_daily_means["PA"],
)
# %%
lin_reg_proximity.score(
df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
df_full_data_daily_means["PA"],
)
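The notebook above fits a linear regression with leave-one-participant-out validation. A minimal, self-contained sketch of the same pattern on synthetic data (the feature and target column names are reused from the notebook; the values themselves are made up):
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score

# Synthetic stand-in for df_full_data_daily_means: two proximity features
# and a PA (positive affect) target, with rows grouped by participant.
rng = np.random.default_rng(0)
df_toy = pd.DataFrame(
    {
        "participant_id": np.repeat([1, 2, 3], 10),
        "freq_prox_near": rng.integers(0, 50, 30),
        "prop_prox_near": rng.random(30),
    }
)
df_toy["PA"] = 3 + 0.01 * df_toy["freq_prox_near"] + rng.normal(0, 0.2, 30)

# LeaveOneGroupOut holds out all rows of one participant per fold,
# so the score reflects generalisation to an unseen participant.
logo = LeaveOneGroupOut()
scores = cross_val_score(
    linear_model.LinearRegression(),
    df_toy[["freq_prox_near", "prop_prox_near"]],
    df_toy["PA"],
    groups=df_toy["participant_id"],
    cv=logo,
    scoring="r2",
)
print(scores)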

View File

@@ -1,76 +0,0 @@
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.11.4
# kernelspec:
# display_name: straw2analysis
# language: python
# name: straw2analysis
# ---
# %%
# %matplotlib inline
import os
import sys
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
sys.path.append(nb_dir)
# %%
from config.models import AppCategories, Participant
from setup import db_engine, session
# %%
query_app_categories = session.query(AppCategories)
with db_engine.connect() as connection:
df_app_categories = pd.read_sql(query_app_categories.statement, connection)
# %%
df_app_categories.head()
# %%
df_app_categories["play_store_genre"].value_counts()
# %%
df_category_not_found = df_app_categories[
df_app_categories["play_store_genre"] == "not_found"
]
# %%
df_category_not_found["play_store_response"].value_counts()
# %%
df_category_not_found["package_name"].value_counts()
# %%
manufacturers = [
"samsung",
"oneplus",
"huawei",
"xiaomi",
"lge",
"motorola",
"miui",
"lenovo",
"oppo",
"mediatek",
]
custom_rom = ["coloros", "lineageos", "myos", "cyanogenmod", "foundation.e"]
other = ["android", "wssyncmldm"]
rows_os_manufacturer = df_category_not_found["package_name"].str.contains(
"|".join(manufacturers + custom_rom + other), case=False
)
# %%
with pd.option_context("display.max_rows", None, "display.max_columns", None):
display(df_category_not_found.loc[~rows_os_manufacturer])

View File

@@ -6,7 +6,7 @@
 # extension: .py
 # format_name: percent
 # format_version: '1.3'
-# jupytext_version: 1.11.4
+# jupytext_version: 1.11.2
 # kernelspec:
 # display_name: straw2analysis
 # language: python
@@ -14,7 +14,6 @@
 # ---
 # %%
-# %matplotlib inline
 import os
 import sys
@@ -54,15 +53,6 @@ import participants.query_db
 participants_inactive_usernames = participants.query_db.get_usernames()
 df_calls_inactive = get_call_data(participants_inactive_usernames)
-# %%
-participants_inactive_usernames
-# %%
-df_calls_inactive.head()
-# %%
-enumerate_contacts(df_calls_inactive).head()
 # %%
 df_calls_features = count_comms(df_calls_inactive)
 df_calls_features.head()
@@ -80,9 +70,6 @@ calls_number = pd.wide_to_long(
 suffix="\D+",
 )
-# %%
-calls_number
 # %%
 sns.displot(calls_number, x="no", hue="call_type", binwidth=5, element="step", height=8)
@@ -139,30 +126,3 @@ sms_number = pd.wide_to_long(
 sns.displot(
 sms_number, x="no", hue="message_type", binwidth=5, element="step", height=8
 )
-# %% [markdown]
-# # Communication features
-# %%
-df_calls_enumerated = enumerate_contacts(df_calls)
-display(df_calls_enumerated)
-# %%
-df_calls_contact_features = contact_features(df_calls_enumerated)
-display(df_calls_contact_features)
-# %%
-df_sms_enumerated = enumerate_contacts(df_sms)
-df_sms_contact_features = contact_features(df_sms_enumerated)
-display(df_sms_contact_features)
-# %%
-display(count_comms(df_calls))
-# %%
-display(count_comms(df_sms))
-# %%
-display(calls_sms_features(df_calls, df_sms))
-# %%
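The pd.wide_to_long calls whose closing lines appear in the hunks above reshape per-participant communication counts into a long format with one row per call or message type, which is what sns.displot(..., x="no", hue="call_type") expects. A small sketch of that reshaping on made-up data; the count column names (no_incoming, no_outgoing, no_missed) follow count_comms, while sep="_" is an assumption:
import pandas as pd

# Assumed shape of the count_comms output for calls: one row per participant,
# one column per call type (hypothetical values).
df_calls_features = pd.DataFrame(
    {
        "participant_id": [1, 2],
        "no_incoming": [10, 4],
        "no_outgoing": [7, 9],
        "no_missed": [2, 1],
    }
)
# Reshape to long format: one row per (participant, call_type) with the count in "no".
calls_number = pd.wide_to_long(
    df_calls_features,
    stubnames="no",
    i="participant_id",
    j="call_type",
    sep="_",
    suffix=r"\D+",
).reset_index()
print(calls_number)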

View File

@@ -6,7 +6,7 @@
 # extension: .py
 # format_name: percent
 # format_version: '1.3'
-# jupytext_version: 1.11.4
+# jupytext_version: 1.11.2
 # kernelspec:
 # display_name: straw2analysis
 # language: python
@@ -14,7 +14,6 @@
 # ---
 # %%
-# %matplotlib inline
 import os
 import sys

View File

@@ -86,8 +86,7 @@ def enumerate_contacts(comm_df: pd.DataFrame) -> pd.DataFrame:
 # In other words, recode the contacts into integers from 0 to n_contacts,
 # so that the first one is contacted the most often.
 contact_ids = (
-# Group again for enumeration.
-contact_counts.groupby("participant_id")
+contact_counts.groupby("participant_id")  # Group again for enumeration.
 .cumcount()  # Enumerate (count) rows *within* participants.
 .to_frame("contact_id")
 )
@@ -177,148 +176,15 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame:
 return comm_features
+def contact_features():
+# TODO Implement a method that takes a DF with enumerated contacts as argument and calculates:
+# * Duration of calls per caller (for most common callers)
+# * Determine work vs non-work contacts by work hours heuristics
+# * Number of people contacted
+# And similarly for SMS.
+pass
-def contact_features(df_enumerated: pd.DataFrame) -> pd.DataFrame:
-"""
-Counts the number of people contacted (for each participant) and, if
-df_enumerated is a dataframe containing calls data, the total duration
-of calls between a participant and each of her contacts.
-Parameters
----------
df_enumerated: pd.DataFrame
A dataframe of calls or SMSes; return of function enumerate_contacts.
Returns
-------
comm_df: pd.DataFrame
The altered dataframe with the column no_contacts and, if df_enumerated
contains calls data, an additional column total_call_duration.
"""
# Check whether df contains calls or SMS data since some
# features we want to calculate are type-specific
if "call_duration" in df_enumerated:
# Add a column with the total duration of calls between two people
duration_count = (
df_enumerated.groupby(["participant_id", "contact_id"])
# For each participant and for each caller, sum durations of their calls
["call_duration"]
.sum()
.reset_index() # Make index (which is actually the participant id) a normal column
.rename(columns={"call_duration": "total_call_duration"})
)
# The new dataframe now contains columns containing information about
# participants, callers and the total duration of their calls. All that
# is now left to do is to merge the original df with the new one.
df_enumerated = df_enumerated.merge(
duration_count, on=["participant_id", "contact_id"]
)
contact_count = (
df_enumerated.groupby(["participant_id"])
.nunique()[
"contact_id"
] # For each participant, count the number of distinct contacts
.reset_index() # Make index (which is actually the participant id) a normal column
.rename(columns={"contact_id": "no_contacts"})
)
df_enumerated = (
# Merge df with the newly created df containing info about number of contacts
df_enumerated.merge(contact_count, on="participant_id")
# Sort first by participant_id and then by contact_id and
# thereby restore the initial ordering of input dataframes.
.sort_values(["participant_id", "contact_id"])
)
# TODO:Determine work vs non-work contacts by work hours heuristics
return df_enumerated
+def calls_sms_features():
+# TODO Relate the calls and sms data, such as comparing the number of (missed) calls and messages.
+pass
-def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataFrame:
-"""
-Calculates additional features relating calls and sms data.
Parameters
----------
df_calls: pd.DataFrame
A dataframe of calls (return of get_call_data).
df_sms: pd.DataFrame
A dataframe of SMSes (return of get_sms_data).
Returns
-------
df_calls_sms: pd.DataFrame
The list of features relating calls and sms data for every participant.
These are:
* proportion_calls:
proportion of calls in total number of communications
* proportion_calls_incoming:
proportion of incoming calls in total number of incoming/received communications
* proportion_calls_outgoing:
proportion of outgoing calls in total number of outgoing/sent communications
* proportion_calls_missed_sms_received:
proportion of missed calls to the number of received messages
* proportion_calls_contacts:
proportion of calls contacts in total number of communication contacts
"""
count_calls = count_comms(df_calls)
count_sms = count_comms(df_sms)
count_joined = (
count_calls.merge(
count_sms, on="participant_id", suffixes=("_calls", "_sms")
) # Merge calls and sms features
.reset_index() # Make participant_id a regular column
.assign(
proportion_calls=(
lambda x: x.no_all_calls / (x.no_all_calls + x.no_all_sms)
),
proportion_calls_incoming=(
lambda x: x.no_incoming / (x.no_incoming + x.no_received)
),
proportion_calls_missed_sms_received=(
lambda x: x.no_missed / (x.no_missed + x.no_received)
),
proportion_calls_outgoing=(
lambda x: x.no_outgoing / (x.no_outgoing + x.no_sent)
)
# Calculate new features and create additional columns
)[
[
"participant_id",
"proportion_calls",
"proportion_calls_incoming",
"proportion_calls_outgoing",
"proportion_calls_missed_sms_received",
]
] # Filter out only the relevant features
)
features_calls = contact_features(enumerate_contacts(df_calls))
features_sms = contact_features(enumerate_contacts(df_sms))
features_joined = (
features_calls.merge(
features_sms, on="participant_id", suffixes=("_calls", "_sms")
) # Merge calls and sms features
.reset_index() # Make participant_id a regular column
.assign(
proportion_calls_contacts=(
lambda x: x.no_contacts_calls
/ (x.no_contacts_calls + x.no_contacts_sms)
) # Calculate new features and create additional columns
)[
["participant_id", "proportion_calls_contacts"]
] # Filter out only the relevant features
# Since we are interested only in some features and ignored
# others, a lot of duplicate rows were created. Remove them.
.drop_duplicates()
)
# Join the newly created dataframes
df_calls_sms = count_joined.merge(features_joined, on="participant_id")
return df_calls_sms
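The enumeration step referenced in enumerate_contacts above (contacts sorted by frequency, then numbered within each participant via cumcount) can be illustrated on a toy dataframe; the contact column name and all values here are hypothetical:
import pandas as pd

# Hypothetical per-contact call counts, already sorted so that each participant's
# most frequently contacted person comes first.
contact_counts = pd.DataFrame(
    {
        "participant_id": [1, 1, 1, 2, 2],
        "contact": ["aaa", "bbb", "ccc", "ddd", "eee"],
        "contact_count": [15, 7, 2, 9, 3],
    }
)
# cumcount() numbers rows within each participant starting at 0,
# so the most contacted person gets contact_id 0, the next one 1, and so on.
contact_counts["contact_id"] = contact_counts.groupby("participant_id").cumcount()
print(contact_counts)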

View File

@@ -1,12 +1,14 @@
+import datetime
 from collections.abc import Collection
 import numpy as np
 import pandas as pd
+from pytz import timezone
 from config.models import ESM, Participant
-from features import helper
 from setup import db_engine, session
+TZ_LJ = timezone("Europe/Ljubljana")
 ESM_STATUS_ANSWERED = 2
 GROUP_SESSIONS_BY = ["participant_id", "device_id", "esm_session"]
@@ -65,8 +67,14 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
 df_esm_preprocessed: pd.DataFrame
 A dataframe with added columns: datetime in Ljubljana timezone and all fields from ESM_JSON column.
 """
-df_esm = helper.get_date_from_timestamp(df_esm)
+df_esm["datetime_lj"] = df_esm["double_esm_user_answer_timestamp"].apply(
+lambda x: datetime.datetime.fromtimestamp(x / 1000.0, tz=TZ_LJ)
+)
+df_esm = df_esm.assign(
+date_lj=lambda x: (x.datetime_lj - datetime.timedelta(hours=4)).dt.date
+)
+# Since daytime EMAs could *theoretically* last beyond midnight, but never after 4 AM,
+# the datetime is first translated to 4 h earlier.
 df_esm_json = pd.json_normalize(df_esm["esm_json"]).drop(
 columns=["esm_trigger"]
 )  # The esm_trigger column is already present in the main df.
@@ -248,9 +256,9 @@ def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
 ESM.ESM_TYPE.get("scale"),
 ESM.ESM_TYPE.get("number"),
 ]
-df_esm_clean.loc[
-df_esm_clean["esm_type"].isin(esm_type_numeric)
-] = df_esm_clean.loc[df_esm_clean["esm_type"].isin(esm_type_numeric)].assign(
+df_esm_clean[df_esm_clean["esm_type"].isin(esm_type_numeric)] = df_esm_clean[
+df_esm_clean["esm_type"].isin(esm_type_numeric)
+].assign(
 esm_user_answer_numeric=lambda x: x.esm_user_answer.str.slice(stop=1).astype(
 int
 )
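The clean_up_esm hunk above still relies on scale and number answers being stored as strings whose first character is the numeric code (e.g. "0 - No", as documented for the SAM items below). A tiny illustration of the slicing:
import pandas as pd

answers = pd.Series(["0 - No", "2 - Yes, moderately", "4 - Yes, extremely"])
# The first character of each categorical answer is its numeric code.
print(answers.str.slice(stop=1).astype(int))  # 0, 2, 4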

View File

@@ -1,267 +0,0 @@
import numpy as np
import pandas as pd
import features.esm
QUESTIONNAIRE_ID_SAM = {
"event_stress": 87,
"event_threat": 88,
"event_challenge": 89,
"event_time": 90,
"event_duration": 91,
"event_work_related": 92,
"period_stress": 93,
}
QUESTIONNAIRE_ID_SAM_LOW = min(QUESTIONNAIRE_ID_SAM.values())
QUESTIONNAIRE_ID_SAM_HIGH = max(QUESTIONNAIRE_ID_SAM.values())
GROUP_QUESTIONNAIRES_BY = [
"participant_id",
"device_id",
"esm_session",
]
# Each questionnaire occurs only once within each esm_session on the same device within the same participant.
def extract_stressful_events(df_esm: pd.DataFrame) -> pd.DataFrame:
# 0. Select only questions from Stress Appraisal Measure.
df_esm_preprocessed = features.esm.preprocess_esm(df_esm)
df_esm_sam = df_esm_preprocessed[
(df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_ID_SAM_LOW)
& (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_ID_SAM_HIGH)
]
df_esm_sam_clean = features.esm.clean_up_esm(df_esm_sam)
# 1.
df_esm_event_threat_challenge_mean_wide = calculate_threat_challenge_means(
df_esm_sam_clean
)
# 2.
df_esm_event_stress = detect_stressful_event(df_esm_sam_clean)
# Join to the previously calculated features related to the events.
df_esm_events = df_esm_event_threat_challenge_mean_wide.join(
df_esm_event_stress[
GROUP_QUESTIONNAIRES_BY + ["event_present", "event_stressfulness"]
].set_index(GROUP_QUESTIONNAIRES_BY)
)
# 3.
df_esm_event_work_related = detect_event_work_related(df_esm_sam_clean)
df_esm_events = df_esm_events.join(
df_esm_event_work_related[
GROUP_QUESTIONNAIRES_BY + ["event_work_related"]
].set_index(GROUP_QUESTIONNAIRES_BY)
)
# 4.
df_esm_event_time = convert_event_time(df_esm_sam_clean)
df_esm_events = df_esm_events.join(
df_esm_event_time[GROUP_QUESTIONNAIRES_BY + ["event_time"]].set_index(
GROUP_QUESTIONNAIRES_BY
)
)
# 5.
df_esm_event_duration = extract_event_duration(df_esm_sam_clean)
df_esm_events = df_esm_events.join(
df_esm_event_duration[
GROUP_QUESTIONNAIRES_BY + ["event_duration", "event_duration_info"]
].set_index(GROUP_QUESTIONNAIRES_BY)
)
return df_esm_events
def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
"""
This function calculates challenge and threat (two Stress Appraisal Measure subscales) means,
for each ESM session (within participants and devices).
It creates a grouped dataframe with means in two columns.
Parameters
----------
df_esm_sam_clean: pd.DataFrame
A cleaned up dataframe of Stress Appraisal Measure items.
Returns
-------
df_esm_event_threat_challenge_mean_wide: pd.DataFrame
A dataframe of unique ESM sessions (by participants and devices) with threat and challenge means.
"""
# Select only threat and challenge assessments for events
df_esm_event_threat_challenge = df_esm_sam_clean[
(
df_esm_sam_clean["questionnaire_id"]
== QUESTIONNAIRE_ID_SAM.get("event_threat")
)
| (
df_esm_sam_clean["questionnaire_id"]
== QUESTIONNAIRE_ID_SAM.get("event_challenge")
)
]
# Calculate mean of threat and challenge subscales for each ESM session.
df_esm_event_threat_challenge_mean_wide = pd.pivot_table(
df_esm_event_threat_challenge,
index=["participant_id", "device_id", "esm_session"],
columns=["questionnaire_id"],
values=["esm_user_answer_numeric"],
aggfunc="mean",
)
# Drop unnecessary column values.
df_esm_event_threat_challenge_mean_wide.columns = df_esm_event_threat_challenge_mean_wide.columns.get_level_values(
1
)
df_esm_event_threat_challenge_mean_wide.columns.name = None
df_esm_event_threat_challenge_mean_wide.rename(
columns={
QUESTIONNAIRE_ID_SAM.get("event_threat"): "threat_mean",
QUESTIONNAIRE_ID_SAM.get("event_challenge"): "challenge_mean",
},
inplace=True,
)
return df_esm_event_threat_challenge_mean_wide
def detect_stressful_event(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
"""
Participants were asked: "Was there a particular event that created tension in you?"
The following options were available:
0 - No,
1 - Yes, slightly,
2 - Yes, moderately,
3 - Yes, considerably,
4 - Yes, extremely.
This function indicates whether there was a stressful event (True/False)
and how stressful it was on a scale of 1 to 4.
Parameters
----------
df_esm_sam_clean: pd.DataFrame
A cleaned up dataframe of Stress Appraisal Measure items.
Returns
-------
df_esm_event_stress: pd.DataFrame
The same dataframe with two new columns:
- event_present, indicating whether there was a stressful event at all,
- event_stressfulness, a numeric answer (1-4) to the single item question.
"""
df_esm_event_stress = df_esm_sam_clean[
df_esm_sam_clean["questionnaire_id"] == QUESTIONNAIRE_ID_SAM.get("event_stress")
]
df_esm_event_stress = df_esm_event_stress.assign(
event_present=lambda x: x.esm_user_answer_numeric > 0,
event_stressfulness=lambda x: x.esm_user_answer_numeric,
)
return df_esm_event_stress
def detect_event_work_related(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
"""
This function simply adds a column indicating the answer to the question:
"Was/is this event work-related?"
Parameters
----------
df_esm_sam_clean: pd.DataFrame
A cleaned up dataframe of Stress Appraisal Measure items.
Returns
-------
df_esm_event_stress: pd.DataFrame
The same dataframe with a new column event_work_related (True/False).
"""
df_esm_event_stress = df_esm_sam_clean[
df_esm_sam_clean["questionnaire_id"]
== QUESTIONNAIRE_ID_SAM.get("event_work_related")
]
df_esm_event_stress = df_esm_event_stress.assign(
event_work_related=lambda x: x.esm_user_answer_numeric > 0
)
return df_esm_event_stress
def convert_event_time(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
"""
This function only serves to convert the string datetime answer into a real datetime type.
Errors during this conversion are coerced, meaning that non-datetime answers are assigned Not a Time (NaT).
NOTE: Since the only available non-datetime answer to this question was "0 - I do not remember",
the NaTs can be interpreted to mean this.
Parameters
----------
df_esm_sam_clean: pd.DataFrame
A cleaned up dataframe of Stress Appraisal Measure items.
Returns
-------
df_esm_event_time: pd.DataFrame
The same dataframe with a new column event_time of datetime type.
"""
df_esm_event_time = df_esm_sam_clean[
df_esm_sam_clean["questionnaire_id"] == QUESTIONNAIRE_ID_SAM.get("event_time")
].assign(
event_time=lambda x: pd.to_datetime(
x.esm_user_answer, errors="coerce", infer_datetime_format=True, exact=True
)
)
return df_esm_event_time
def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
"""
If participants indicated a stressful event, they were asked:
"How long did this event last? (Answer in hours and minutes)"
This function extracts this duration time and saves additional answers:
0 - I do not remember,
1 - It is still going on.
Parameters
----------
df_esm_sam_clean: pd.DataFrame
A cleaned up dataframe of Stress Appraisal Measure items.
Returns
-------
df_esm_event_duration: pd.DataFrame
The same dataframe with two new columns:
- event_duration, a time part of a datetime,
- event_duration_info, giving other options to this question:
0 - I do not remember,
1 - It is still going on
"""
df_esm_event_duration = df_esm_sam_clean[
df_esm_sam_clean["questionnaire_id"]
== QUESTIONNAIRE_ID_SAM.get("event_duration")
].assign(
event_duration=lambda x: pd.to_datetime(
x.esm_user_answer.str.slice(start=0, stop=-6), errors="coerce"
).dt.time
)
# TODO Explore the values recorded in event_duration and possibly fix mistakes.
# For example, participants reported setting 23:50:00 instead of 00:50:00.
# For the events that no duration was found (i.e. event_duration = NaT),
# we can determine whether:
# - this event is still going on ("1 - It is still going on")
# - the participant couldn't remember its duration ("0 - I do not remember")
# Generally, these answers were converted to esm_user_answer_numeric in clean_up_esm,
# but only the numeric types of questions and answers.
# Since this was of "datetime" type, convert these specific answers here again.
df_esm_event_duration["event_duration_info"] = np.nan
df_esm_event_duration[
df_esm_event_duration.event_duration.isna()
] = df_esm_event_duration[df_esm_event_duration.event_duration.isna()].assign(
event_duration_info=lambda x: x.esm_user_answer.str.slice(stop=1).astype(int)
)
return df_esm_event_duration
# TODO: How many questions about the stressfulness of the period were asked and how does this relate to events?
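The pivot in calculate_threat_challenge_means turns long-format SAM answers into one row per ESM session with separate threat and challenge means. A minimal sketch with made-up answers, using questionnaire IDs 88 (event_threat) and 89 (event_challenge) from QUESTIONNAIRE_ID_SAM:
import pandas as pd

df_esm_sam_clean = pd.DataFrame(
    {
        "participant_id": [1, 1, 1, 1],
        "device_id": ["d1"] * 4,
        "esm_session": [0, 0, 0, 0],
        "questionnaire_id": [88, 88, 89, 89],
        "esm_user_answer_numeric": [2, 4, 1, 3],
    }
)
# One row per (participant, device, session); columns are the per-questionnaire means.
means_wide = pd.pivot_table(
    df_esm_sam_clean,
    index=["participant_id", "device_id", "esm_session"],
    columns=["questionnaire_id"],
    values=["esm_user_answer_numeric"],
    aggfunc="mean",
)
means_wide.columns = means_wide.columns.get_level_values(1)
means_wide = means_wide.rename(columns={88: "threat_mean", 89: "challenge_mean"})
print(means_wide)  # threat_mean = 3.0, challenge_mean = 2.0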

View File

@@ -1,41 +0,0 @@
import datetime
import pandas as pd
from pytz import timezone
TZ_LJ = timezone("Europe/Ljubljana")
COLUMN_TIMESTAMP = "timestamp"
COLUMN_TIMESTAMP_ESM = "double_esm_user_answer_timestamp"
def get_date_from_timestamp(df_aware) -> pd.DataFrame:
"""
Transform a UNIX timestamp into a datetime (with Ljubljana timezone).
Additionally, extract only the date part, where anything until 4 AM is considered the same day.
Parameters
----------
df_aware: pd.DataFrame
Any AWARE-type data as defined in models.py.
Returns
-------
df_aware: pd.DataFrame
The same dataframe with datetime_lj and date_lj columns added.
"""
if COLUMN_TIMESTAMP_ESM in df_aware:
column_timestamp = COLUMN_TIMESTAMP_ESM
else:
column_timestamp = COLUMN_TIMESTAMP
df_aware["datetime_lj"] = df_aware[column_timestamp].apply(
lambda x: datetime.datetime.fromtimestamp(x / 1000.0, tz=TZ_LJ)
)
df_aware = df_aware.assign(
date_lj=lambda x: (x.datetime_lj - datetime.timedelta(hours=4)).dt.date
)
# Since daytime EMAs could *theoretically* last beyond midnight, but never after 4 AM,
# the datetime is first translated to 4 h earlier.
return df_aware
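The 4-hour shift in get_date_from_timestamp means an answer submitted shortly after midnight is still attributed to the previous day. A worked example with a made-up timestamp:
import datetime
from pytz import timezone

TZ_LJ = timezone("Europe/Ljubljana")

# A hypothetical ESM answer timestamp: 2020-08-02 01:30 local time, in milliseconds.
timestamp_ms = 1596324600000
datetime_lj = datetime.datetime.fromtimestamp(timestamp_ms / 1000.0, tz=TZ_LJ)
date_lj = (datetime_lj - datetime.timedelta(hours=4)).date()
print(datetime_lj)  # 2020-08-02 01:30:00+02:00
print(date_lj)      # 2020-08-01, i.e. the answer counts towards the previous day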

View File

@@ -28,63 +28,3 @@ def get_proximity_data(usernames: Collection) -> pd.DataFrame:
 with db_engine.connect() as connection:
 df_proximity = pd.read_sql(query_proximity.statement, connection)
 return df_proximity
def recode_proximity(df_proximity: pd.DataFrame) -> pd.DataFrame:
"""
This function recodes proximity from a double to a boolean value.
Different proximity sensors report different values,
but in our data only several distinct values have ever been found.
These are therefore converted into "near" and "far" binary values.
See expl_proximity.ipynb for additional info.
Parameters
----------
df_proximity: pd.DataFrame
A dataframe of proximity data.
Returns
-------
df_proximity: pd.DataFrame
The same dataframe with an additional column bool_prox_near,
indicating whether "near" proximity was reported.
False values correspond to "far" reported by this sensor.
"""
df_proximity = df_proximity.assign(bool_prox_near=lambda x: x.double_proximity == 0)
return df_proximity
def count_proximity(
df_proximity: pd.DataFrame, group_by: Collection = ["participant_id"]
) -> pd.DataFrame:
"""
The function counts how many times a "near" value occurs in proximity
and calculates the proportion of these counts to all proximity values (i.e. relative count).
Parameters
----------
df_proximity: pd.DataFrame
A dataframe of proximity data.
group_by: Collection
A list of strings, specifying by which parameters to group.
By default, the features are calculated per participant, but could be "date_lj" etc.
Returns
-------
df_proximity_features: pd.DataFrame
A dataframe with the count of "near" proximity values and their relative count.
"""
if "bool_prox_near" not in df_proximity:
df_proximity = recode_proximity(df_proximity)
df_proximity["bool_prox_far"] = ~df_proximity["bool_prox_near"]
df_proximity_features = df_proximity.groupby(group_by).sum()[
["bool_prox_near", "bool_prox_far"]
]
df_proximity_features = df_proximity_features.assign(
prop_prox_near=lambda x: x.bool_prox_near / (x.bool_prox_near + x.bool_prox_far)
)
df_proximity_features = df_proximity_features.rename(
columns={"bool_prox_near": "freq_prox_near"}
).drop(columns="bool_prox_far", inplace=False)
return df_proximity_features
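To see what recode_proximity and count_proximity produce together, here is a small sketch that repeats their logic on fabricated readings (0.0 is the only value treated as "near"):
import pandas as pd

# Fabricated proximity readings for two participants on one day.
df_proximity = pd.DataFrame(
    {
        "participant_id": [1, 1, 1, 2, 2],
        "date_lj": pd.to_datetime("2020-08-01").date(),
        "double_proximity": [0.0, 5.0, 0.0, 0.0, 8.0],
    }
)
# recode_proximity: 0.0 is reported as "near", everything else as "far".
df_proximity = df_proximity.assign(bool_prox_near=lambda x: x.double_proximity == 0)

# count_proximity, per participant and day: absolute and relative count of "near".
df_proximity["bool_prox_far"] = ~df_proximity["bool_prox_near"]
counts = df_proximity.groupby(["participant_id", "date_lj"]).sum()[
    ["bool_prox_near", "bool_prox_far"]
]
counts = (
    counts.assign(
        prop_prox_near=lambda x: x.bool_prox_near / (x.bool_prox_near + x.bool_prox_far)
    )
    .rename(columns={"bool_prox_near": "freq_prox_near"})
    .drop(columns="bool_prox_far")
)
print(counts)  # participant 1: freq 2, prop 2/3; participant 2: freq 1, prop 1/2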

View File

@@ -6,7 +6,7 @@
 # extension: .py
 # format_name: percent
 # format_version: '1.3'
-# jupytext_version: 1.11.4
+# jupytext_version: 1.11.2
 # kernelspec:
 # display_name: straw2analysis
 # language: python
@@ -14,12 +14,12 @@
 # ---
 # %%
-# %matplotlib inline
 import datetime
+# %%
 import os
 import sys
-import matplotlib.pyplot as plt
 import pandas as pd
 import seaborn as sns
 import statsmodels.api as sm
@@ -31,24 +31,6 @@ if nb_dir not in sys.path:
 import participants.query_db
 from features.esm import *
-# %%
-SAVE_FIGS = True
-FIG_HEIGHT = 5
-FIG_ASPECT = 1.7
-FIG_COLOUR = "#28827C"
-SMALL_SIZE = 14
-MEDIUM_SIZE = SMALL_SIZE + 2
-BIGGER_SIZE = MEDIUM_SIZE + 2
-plt.rc("font", size=SMALL_SIZE)  # controls default text sizes
-plt.rc("axes", titlesize=SMALL_SIZE)  # fontsize of the axes title
-plt.rc("axes", labelsize=MEDIUM_SIZE)  # fontsize of the x and y labels
-plt.rc("xtick", labelsize=SMALL_SIZE)  # fontsize of the tick labels
-plt.rc("ytick", labelsize=SMALL_SIZE)  # fontsize of the tick labels
-plt.rc("legend", fontsize=SMALL_SIZE)  # legend fontsize
-plt.rc("figure", titlesize=BIGGER_SIZE)  # fontsize of the figure title
 # %%
 baseline_si = pd.read_csv("E:/STRAWbaseline/results-survey637813.csv")
 baseline_be_1 = pd.read_csv("E:/STRAWbaseline/results-survey358134.csv")
@@ -148,7 +130,7 @@ df_adherence.describe()
 df_adherence[["gender", "startlanguage"]].value_counts()
 # %%
-sns.displot(df_adherence["finished_sessions"], binwidth=5, height=FIG_HEIGHT)
+sns.displot(df_adherence["finished_sessions"], binwidth=5, height=5)
 # %%
 lm_adherence = smf.ols(
@@ -242,14 +224,12 @@ df_session_workday = df_session_workday.assign(
 g1 = sns.displot(
 df_session_workday["time_diff_minutes"],
 binwidth=5,
-height=FIG_HEIGHT,
-aspect=FIG_ASPECT,
-color=FIG_COLOUR,
+height=5,
+aspect=1.5,
+color="#28827C",
 )
 g1.set_axis_labels("Time difference [min]", "Session count")
-g1.set(xlim=(0, 570))
-if SAVE_FIGS:
-g1.savefig("WorkdayEMAtimeDiff.pdf")
+# g1.savefig("WorkdayEMAtimeDiff.pdf")
 # %% [markdown]
 # There are some sessions that are really close together. By design, none should be closer than 30 min. Let's take a look at those.
@@ -316,13 +296,12 @@ df_mean_daytime_interval.describe()
 g2 = sns.displot(
 df_mean_daytime_interval.time_diff_minutes,
 binwidth=5,
-height=FIG_HEIGHT,
-aspect=FIG_ASPECT,
-color=FIG_COLOUR,
+height=5,
+aspect=1.5,
+color="#28827C",
 )
 g2.set_axis_labels("Median time difference [min]", "Participant count")
-if SAVE_FIGS:
-g2.savefig("WorkdayEMAtimeDiffMedianParticip.pdf")
+# g2.savefig("WorkdayEMAtimeDiffMedianParticip.pdf")
 # %%
 df_adherence = df_adherence.merge(
@@ -348,9 +327,9 @@ df_count_daytime_per_participant["time"].describe()
 sns.displot(
 df_count_daytime_per_participant.time,
 binwidth=1,
-height=FIG_HEIGHT,
-aspect=FIG_ASPECT,
-color=FIG_COLOUR,
+height=5,
+aspect=1.5,
+color="#28827C",
 )
 # %% [markdown]
@@ -385,14 +364,13 @@ s_evening_completed_ratio.describe()
 g3 = sns.displot(
 s_evening_completed_ratio - 0.001,
 binwidth=0.05,
-height=FIG_HEIGHT,
-aspect=FIG_ASPECT,
-color=FIG_COLOUR,
+height=5,
+aspect=1.5,
+color="#28827C",
 )
 g3.set_axis_labels("Ratio of days with the evening EMA filled out", "Participant count")
 g3.set(xlim=(1.01, 0.59))
-if SAVE_FIGS:
-g3.savefig("EveningEMAratioParticip.pdf")
+# g3.savefig("EveningEMAratioParticip.pdf")
 # %%
 df_adherence = df_adherence.merge(
@@ -408,3 +386,5 @@ lr_ols_evening_ratio = smf.ols(
 )
 ls_result_evening_ratio = lr_ols_evening_ratio.fit()
 ls_result_evening_ratio.summary()
+# %%

View File

@@ -16,16 +16,7 @@ class EsmFeatures(unittest.TestCase):
 def test_preprocess_esm(self):
 self.esm_processed = preprocess_esm(self.esm)
-# Check for columns which should have been extracted from esm_json.
 self.assertIn("question_id", self.esm_processed)
-self.assertIn("questionnaire_id", self.esm_processed)
-self.assertIn("esm_instructions", self.esm_processed)
-self.assertIn("esm_type", self.esm_processed)
-self.assertIn("time", self.esm_processed)
-# Check for explicitly added column.
-self.assertIn("datetime_lj", self.esm_processed)
-# All of these keys are referenced in other functions, so they are expected to be present in preprocessed ESM.
-# Since all of these are added in a single function, it should be OK to have many assert statements in one test.
 def test_classify_sessions_by_completion(self):
 self.esm_classified_sessions = classify_sessions_by_completion(