Compare commits

..

No commits in common. "577a874288e84f5b57c4ae79b89712c67b1285ef" and "1aaf95fe9e3ca9d9bc578d847f1914329f5a27f8" have entirely different histories.

13 changed files with 47 additions and 838 deletions

View File

@@ -16,7 +16,6 @@ dependencies:
- python-dotenv
- pytz
- seaborn
- scikit-learn
- sqlalchemy
- statsmodels
- tabulate

View File

@@ -1,150 +0,0 @@
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.11.4
# kernelspec:
# display_name: straw2analysis
# language: python
# name: straw2analysis
# ---
# %%
# %matplotlib inline
import datetime
import os
import sys
import seaborn as sns
from sklearn import linear_model
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
sys.path.append(nb_dir)
# %%
import participants.query_db
from features import esm, helper, proximity
# %% [markdown]
# # 1. Get the relevant data
# %%
participants_inactive_usernames = participants.query_db.get_usernames(
collection_start=datetime.date.fromisoformat("2020-08-01")
)
# Consider only two participants to simplify.
ptcp_2 = participants_inactive_usernames[0:2]
# %% [markdown]
# ## 1.1 Labels
# %%
df_esm = esm.get_esm_data(ptcp_2)
df_esm_preprocessed = esm.preprocess_esm(df_esm)
# %%
df_esm_PANAS = df_esm_preprocessed[
(df_esm_preprocessed["questionnaire_id"] == 8)
| (df_esm_preprocessed["questionnaire_id"] == 9)
]
df_esm_PANAS_clean = esm.clean_up_esm(df_esm_PANAS)
# %% [markdown]
# ## 1.2 Sensor data
# %%
df_proximity = proximity.get_proximity_data(ptcp_2)
df_proximity = helper.get_date_from_timestamp(df_proximity)
df_proximity = proximity.recode_proximity(df_proximity)
# %% [markdown]
# ## 1.3 Standardization/personalization
# %% [markdown]
# # 2. Grouping/segmentation
# %%
df_esm_PANAS_daily_means = (
df_esm_PANAS_clean.groupby(["participant_id", "date_lj", "questionnaire_id"])
.esm_user_answer_numeric.agg("mean")
.reset_index()
.rename(columns={"esm_user_answer_numeric": "esm_numeric_mean"})
)
# %%
df_esm_PANAS_daily_means = (
df_esm_PANAS_daily_means.pivot(
index=["participant_id", "date_lj"],
columns="questionnaire_id",
values="esm_numeric_mean",
)
.reset_index(col_level=1)
.rename(columns={8.0: "PA", 9.0: "NA"})
.set_index(["participant_id", "date_lj"])
)
# %%
df_proximity_daily_counts = proximity.count_proximity(
df_proximity, ["participant_id", "date_lj"]
)
# %%
df_proximity_daily_counts
# %% [markdown]
# # 3. Join features (and export to csv?)
# %%
df_full_data_daily_means = df_esm_PANAS_daily_means.join(
df_proximity_daily_counts
).reset_index()
# %% [markdown]
# # 4. Machine learning model and parameters
# %%
lin_reg_proximity = linear_model.LinearRegression()
# %% [markdown]
# ## 4.1 Validation method
# %%
logo = LeaveOneGroupOut()
logo.get_n_splits(
df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
df_full_data_daily_means["PA"],
groups=df_full_data_daily_means["participant_id"],
)
# %% [markdown]
# ## 4.2 Fit results (export?)
# %%
cross_val_score(
lin_reg_proximity,
df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
df_full_data_daily_means["PA"],
groups=df_full_data_daily_means["participant_id"],
cv=logo,
n_jobs=-1,
scoring="r2",
)
# %%
lin_reg_proximity.fit(
df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
df_full_data_daily_means["PA"],
)
# %%
lin_reg_proximity.score(
df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
df_full_data_daily_means["PA"],
)
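# %% [markdown]
# A minimal illustration (not part of the original notebook) of how LeaveOneGroupOut
# behaves when grouping by participant: with two participants there are exactly two
# folds, and each fold holds out all of one participant's rows. The data below are
# hypothetical placeholders, not STRAW data.
# %%
import numpy as np
from sklearn.model_selection import LeaveOneGroupOut

X_demo = np.arange(12).reshape(6, 2)  # six "days" of two proximity features
y_demo = np.arange(6, dtype=float)  # six daily PA means
groups_demo = np.array([1, 1, 1, 2, 2, 2])  # participant ids
for train_idx, test_idx in LeaveOneGroupOut().split(X_demo, y_demo, groups_demo):
    print(groups_demo[test_idx])  # all rows of a single participant are held out together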

View File

@@ -1,76 +0,0 @@
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.11.4
# kernelspec:
# display_name: straw2analysis
# language: python
# name: straw2analysis
# ---
# %%
# %matplotlib inline
import os
import sys
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
sys.path.append(nb_dir)
# %%
from config.models import AppCategories, Participant
from setup import db_engine, session
# %%
query_app_categories = session.query(AppCategories)
with db_engine.connect() as connection:
df_app_categories = pd.read_sql(query_app_categories.statement, connection)
# %%
df_app_categories.head()
# %%
df_app_categories["play_store_genre"].value_counts()
# %%
df_category_not_found = df_app_categories[
df_app_categories["play_store_genre"] == "not_found"
]
# %%
df_category_not_found["play_store_response"].value_counts()
# %%
df_category_not_found["package_name"].value_counts()
# %%
manufacturers = [
"samsung",
"oneplus",
"huawei",
"xiaomi",
"lge",
"motorola",
"miui",
"lenovo",
"oppo",
"mediatek",
]
custom_rom = ["coloros", "lineageos", "myos", "cyanogenmod", "foundation.e"]
other = ["android", "wssyncmldm"]
rows_os_manufacturer = df_category_not_found["package_name"].str.contains(
"|".join(manufacturers + custom_rom + other), case=False
)
# %%
with pd.option_context("display.max_rows", None, "display.max_columns", None):
display(df_category_not_found.loc[~rows_os_manufacturer])
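# %% [markdown]
# A small sketch (with hypothetical package names) of the matching logic above:
# the joined pattern acts as a case-insensitive "contains any of these substrings" filter.
# %%
example_packages = pd.Series(
    ["com.samsung.android.dialer", "com.whatsapp", "org.lineageos.updater"]
)
example_packages.str.contains("|".join(manufacturers + custom_rom + other), case=False)
# Expected: True, False, True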

View File

@@ -6,7 +6,7 @@
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.11.4
# jupytext_version: 1.11.2
# kernelspec:
# display_name: straw2analysis
# language: python
@@ -14,7 +14,6 @@
# ---
# %%
# %matplotlib inline
import os
import sys
@@ -54,15 +53,6 @@ import participants.query_db
participants_inactive_usernames = participants.query_db.get_usernames()
df_calls_inactive = get_call_data(participants_inactive_usernames)
# %%
participants_inactive_usernames
# %%
df_calls_inactive.head()
# %%
enumerate_contacts(df_calls_inactive).head()
# %%
df_calls_features = count_comms(df_calls_inactive)
df_calls_features.head()
@@ -80,9 +70,6 @@ calls_number = pd.wide_to_long(
suffix="\D+",
)
# %%
calls_number
# %%
sns.displot(calls_number, x="no", hue="call_type", binwidth=5, element="step", height=8)
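# %% [markdown]
# A self-contained sketch of the pd.wide_to_long() reshaping used above, on hypothetical
# counts; the column names ("no_incoming" etc.) mirror what count_comms() is assumed to produce.
# %%
import pandas as pd

df_wide_demo = pd.DataFrame(
    {
        "participant_id": [1, 2],
        "no_incoming": [10, 3],
        "no_outgoing": [7, 5],
        "no_missed": [2, 0],
    }
)
pd.wide_to_long(
    df_wide_demo,
    stubnames="no",
    i="participant_id",
    j="call_type",
    sep="_",
    suffix="\\D+",  # suffixes are words ("incoming", ...), not the default digits
).reset_index()
# One row per (participant_id, call_type) with a single "no" column.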
@@ -139,30 +126,3 @@ sms_number = pd.wide_to_long(
sns.displot(
sms_number, x="no", hue="message_type", binwidth=5, element="step", height=8
)
# %% [markdown]
# # Communication features
# %%
df_calls_enumerated = enumerate_contacts(df_calls)
display(df_calls_enumerated)
# %%
df_calls_contact_features = contact_features(df_calls_enumerated)
display(df_calls_contact_features)
# %%
df_sms_enumerated = enumerate_contacts(df_sms)
df_sms_contact_features = contact_features(df_sms_enumerated)
display(df_sms_contact_features)
# %%
display(count_comms(df_calls))
# %%
display(count_comms(df_sms))
# %%
display(calls_sms_features(df_calls, df_sms))
# %%

View File

@@ -6,7 +6,7 @@
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.11.4
# jupytext_version: 1.11.2
# kernelspec:
# display_name: straw2analysis
# language: python
@@ -14,7 +14,6 @@
# ---
# %%
# %matplotlib inline
import os
import sys

View File

@@ -86,8 +86,7 @@ def enumerate_contacts(comm_df: pd.DataFrame) -> pd.DataFrame:
# In other words, recode the contacts into integers from 0 to n_contacts,
# so that the first one is contacted the most often.
contact_ids = (
# Group again for enumeration.
contact_counts.groupby("participant_id")
contact_counts.groupby("participant_id") # Group again for enumeration.
.cumcount() # Enumerate (count) rows *within* participants.
.to_frame("contact_id")
)
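# An illustrative sketch (not part of this module, hypothetical data) of the cumcount()
# enumeration above, assuming contact_counts is already sorted by contact frequency
# within each participant.
import pandas as pd

contact_counts_demo = pd.DataFrame(
    {"participant_id": [1, 1, 1, 2, 2], "contact_trace": ["a", "b", "c", "x", "y"]}
)
contact_ids_demo = contact_counts_demo.groupby("participant_id").cumcount().to_frame("contact_id")
# -> 0, 1, 2 for participant 1 and 0, 1 for participant 2, i.e. contact_id 0 is each
# participant's most frequently contacted person.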
@@ -177,148 +176,15 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame:
return comm_features
def contact_features(df_enumerated: pd.DataFrame) -> pd.DataFrame:
"""
Counts the number of people contacted (for each participant) and, if
df_enumerated is a dataframe containing calls data, the total duration
of calls between a participant and each of her contacts.
Parameters
----------
df_enumerated: pd.DataFrame
A dataframe of calls or SMSes; return of function enumerate_contacts.
Returns
-------
comm_df: pd.DataFrame
The altered dataframe with the column no_contacts and, if df_enumerated
contains calls data, an additional column total_call_duration.
"""
# Check whether df contains calls or SMS data since some
# features we want to calculate are type-specific
if "call_duration" in df_enumerated:
# Add a column with the total duration of calls between two people
duration_count = (
df_enumerated.groupby(["participant_id", "contact_id"])
# For each participant and for each caller, sum durations of their calls
["call_duration"]
.sum()
.reset_index() # Make index (which is actually the participant id) a normal column
.rename(columns={"call_duration": "total_call_duration"})
)
# The new dataframe now contains columns containing information about
# participants, callers and the total duration of their calls. All that
# is now left to do is to merge the original df with the new one.
df_enumerated = df_enumerated.merge(
duration_count, on=["participant_id", "contact_id"]
)
contact_count = (
df_enumerated.groupby(["participant_id"])
.nunique()[
"contact_id"
] # For each participant, count the number of distinct contacts
.reset_index() # Make index (which is actually the participant id) a normal column
.rename(columns={"contact_id": "no_contacts"})
)
df_enumerated = (
# Merge df with the newly created df containing info about number of contacts
df_enumerated.merge(contact_count, on="participant_id")
# Sort first by participant_id and then by contact_id and
# thereby restore the initial ordering of input dataframes.
.sort_values(["participant_id", "contact_id"])
)
# TODO: Determine work vs non-work contacts by work hours heuristics
return df_enumerated
def contact_features():
# TODO Implement a method that takes a DF with enumerated contacts as argument and calculates:
# * Duration of calls per caller (for most common callers)
# * Determine work vs non-work contacts by work hours heuristics
# * Number of people contacted
# And similarly for SMS.
pass
def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataFrame:
"""
Calculates additional features relating calls and sms data.
Parameters
----------
df_calls: pd.DataFrame
A dataframe of calls (return of get_call_data).
df_sms: pd.DataFrame
A dataframe of SMSes (return of get_sms_data).
Returns
-------
df_calls_sms: pd.DataFrame
The list of features relating calls and sms data for every participant.
These are:
* proportion_calls:
proportion of calls in total number of communications
* proportion_calls_incoming:
proportion of incoming calls in total number of incoming/received communications
* proportion_calls_outgoing:
proportion of outgoing calls in total number of outgoing/sent communications
* proportion_calls_missed_sms_received:
proportion of missed calls to the number of received messages
* proportion_calls_contacts:
proportion of calls contacts in total number of communication contacts
"""
count_calls = count_comms(df_calls)
count_sms = count_comms(df_sms)
count_joined = (
count_calls.merge(
count_sms, on="participant_id", suffixes=("_calls", "_sms")
) # Merge calls and sms features
.reset_index() # Make participant_id a regular column
.assign(
proportion_calls=(
lambda x: x.no_all_calls / (x.no_all_calls + x.no_all_sms)
),
proportion_calls_incoming=(
lambda x: x.no_incoming / (x.no_incoming + x.no_received)
),
proportion_calls_missed_sms_received=(
lambda x: x.no_missed / (x.no_missed + x.no_received)
),
proportion_calls_outgoing=(
lambda x: x.no_outgoing / (x.no_outgoing + x.no_sent)
)
# Calculate new features and create additional columns
)[
[
"participant_id",
"proportion_calls",
"proportion_calls_incoming",
"proportion_calls_outgoing",
"proportion_calls_missed_sms_received",
]
] # Filter out only the relevant features
)
features_calls = contact_features(enumerate_contacts(df_calls))
features_sms = contact_features(enumerate_contacts(df_sms))
features_joined = (
features_calls.merge(
features_sms, on="participant_id", suffixes=("_calls", "_sms")
) # Merge calls and sms features
.reset_index() # Make participant_id a regular column
.assign(
proportion_calls_contacts=(
lambda x: x.no_contacts_calls
/ (x.no_contacts_calls + x.no_contacts_sms)
) # Calculate new features and create additional columns
)[
["participant_id", "proportion_calls_contacts"]
] # Filter out only the relevant features
# Since we are interested only in some features and ignored
# others, a lot of duplicate rows were created. Remove them.
.drop_duplicates()
)
# Join the newly created dataframes
df_calls_sms = count_joined.merge(features_joined, on="participant_id")
return df_calls_sms
def calls_sms_features():
# TODO Relate the calls and sms data, such as comparing the number of (missed) calls and messages.
pass
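# A worked sketch (hypothetical counts) of the proportions defined in the docstring above:
no_all_calls, no_all_sms = 30, 90
no_incoming, no_received = 12, 40
proportion_calls = no_all_calls / (no_all_calls + no_all_sms)  # 30 / 120 = 0.25
proportion_calls_incoming = no_incoming / (no_incoming + no_received)  # 12 / 52, about 0.23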

View File

@@ -1,12 +1,14 @@
import datetime
from collections.abc import Collection
import numpy as np
import pandas as pd
from pytz import timezone
from config.models import ESM, Participant
from features import helper
from setup import db_engine, session
TZ_LJ = timezone("Europe/Ljubljana")
ESM_STATUS_ANSWERED = 2
GROUP_SESSIONS_BY = ["participant_id", "device_id", "esm_session"]
@@ -65,8 +67,14 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
df_esm_preprocessed: pd.DataFrame
A dataframe with added columns: datetime in Ljubljana timezone and all fields from ESM_JSON column.
"""
df_esm = helper.get_date_from_timestamp(df_esm)
df_esm["datetime_lj"] = df_esm["double_esm_user_answer_timestamp"].apply(
lambda x: datetime.datetime.fromtimestamp(x / 1000.0, tz=TZ_LJ)
)
df_esm = df_esm.assign(
date_lj=lambda x: (x.datetime_lj - datetime.timedelta(hours=4)).dt.date
)
# Since daytime EMAs could *theoretically* last beyond midnight, but never after 4 AM,
# the datetime is first translated to 4 h earlier.
df_esm_json = pd.json_normalize(df_esm["esm_json"]).drop(
columns=["esm_trigger"]
) # The esm_trigger column is already present in the main df.
@@ -248,9 +256,9 @@ def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
ESM.ESM_TYPE.get("scale"),
ESM.ESM_TYPE.get("number"),
]
df_esm_clean.loc[
df_esm_clean["esm_type"].isin(esm_type_numeric)
] = df_esm_clean.loc[df_esm_clean["esm_type"].isin(esm_type_numeric)].assign(
df_esm_clean[df_esm_clean["esm_type"].isin(esm_type_numeric)] = df_esm_clean[
df_esm_clean["esm_type"].isin(esm_type_numeric)
].assign(
esm_user_answer_numeric=lambda x: x.esm_user_answer.str.slice(stop=1).astype(
int
)
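# An illustrative sketch (hypothetical answers) of the slicing above: the numeric value
# is assumed to be the first character of answers such as "3 - Yes, considerably".
answers_demo = pd.Series(["0 - No", "3 - Yes, considerably"])
answers_demo.str.slice(stop=1).astype(int)  # -> 0, 3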

View File

@@ -1,267 +0,0 @@
import numpy as np
import pandas as pd
import features.esm
QUESTIONNAIRE_ID_SAM = {
"event_stress": 87,
"event_threat": 88,
"event_challenge": 89,
"event_time": 90,
"event_duration": 91,
"event_work_related": 92,
"period_stress": 93,
}
QUESTIONNAIRE_ID_SAM_LOW = min(QUESTIONNAIRE_ID_SAM.values())
QUESTIONNAIRE_ID_SAM_HIGH = max(QUESTIONNAIRE_ID_SAM.values())
GROUP_QUESTIONNAIRES_BY = [
"participant_id",
"device_id",
"esm_session",
]
# Each questionnaire occurs only once within each esm_session on the same device within the same participant.
def extract_stressful_events(df_esm: pd.DataFrame) -> pd.DataFrame:
# 0. Select only questions from Stress Appraisal Measure.
df_esm_preprocessed = features.esm.preprocess_esm(df_esm)
df_esm_sam = df_esm_preprocessed[
(df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_ID_SAM_LOW)
& (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_ID_SAM_HIGH)
]
df_esm_sam_clean = features.esm.clean_up_esm(df_esm_sam)
# 1.
df_esm_event_threat_challenge_mean_wide = calculate_threat_challenge_means(
df_esm_sam_clean
)
# 2.
df_esm_event_stress = detect_stressful_event(df_esm_sam_clean)
# Join to the previously calculated features related to the events.
df_esm_events = df_esm_event_threat_challenge_mean_wide.join(
df_esm_event_stress[
GROUP_QUESTIONNAIRES_BY + ["event_present", "event_stressfulness"]
].set_index(GROUP_QUESTIONNAIRES_BY)
)
# 3.
df_esm_event_work_related = detect_event_work_related(df_esm_sam_clean)
df_esm_events = df_esm_events.join(
df_esm_event_work_related[
GROUP_QUESTIONNAIRES_BY + ["event_work_related"]
].set_index(GROUP_QUESTIONNAIRES_BY)
)
# 4.
df_esm_event_time = convert_event_time(df_esm_sam_clean)
df_esm_events = df_esm_events.join(
df_esm_event_time[GROUP_QUESTIONNAIRES_BY + ["event_time"]].set_index(
GROUP_QUESTIONNAIRES_BY
)
)
# 5.
df_esm_event_duration = extract_event_duration(df_esm_sam_clean)
df_esm_events = df_esm_events.join(
df_esm_event_duration[
GROUP_QUESTIONNAIRES_BY + ["event_duration", "event_duration_info"]
].set_index(GROUP_QUESTIONNAIRES_BY)
)
return df_esm_events
def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
"""
This function calculates the means of the challenge and threat subscales (two Stress Appraisal Measure subscales)
for each ESM session (within participants and devices).
It creates a grouped dataframe with the two means in separate columns.
Parameters
----------
df_esm_sam_clean: pd.DataFrame
A cleaned up dataframe of Stress Appraisal Measure items.
Returns
-------
df_esm_event_threat_challenge_mean_wide: pd.DataFrame
A dataframe of unique ESM sessions (by participants and devices) with threat and challenge means.
"""
# Select only threat and challenge assessments for events
df_esm_event_threat_challenge = df_esm_sam_clean[
(
df_esm_sam_clean["questionnaire_id"]
== QUESTIONNAIRE_ID_SAM.get("event_threat")
)
| (
df_esm_sam_clean["questionnaire_id"]
== QUESTIONNAIRE_ID_SAM.get("event_challenge")
)
]
# Calculate mean of threat and challenge subscales for each ESM session.
df_esm_event_threat_challenge_mean_wide = pd.pivot_table(
df_esm_event_threat_challenge,
index=["participant_id", "device_id", "esm_session"],
columns=["questionnaire_id"],
values=["esm_user_answer_numeric"],
aggfunc="mean",
)
# Drop unnecessary column values.
df_esm_event_threat_challenge_mean_wide.columns = df_esm_event_threat_challenge_mean_wide.columns.get_level_values(
1
)
df_esm_event_threat_challenge_mean_wide.columns.name = None
df_esm_event_threat_challenge_mean_wide.rename(
columns={
QUESTIONNAIRE_ID_SAM.get("event_threat"): "threat_mean",
QUESTIONNAIRE_ID_SAM.get("event_challenge"): "challenge_mean",
},
inplace=True,
)
return df_esm_event_threat_challenge_mean_wide
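# An illustrative sketch (hypothetical answers) of the pivot above: two threat (88) and
# two challenge (89) items from a single ESM session are averaged into one wide row.
df_sam_demo = pd.DataFrame(
    {
        "participant_id": [1, 1, 1, 1],
        "device_id": ["d1", "d1", "d1", "d1"],
        "esm_session": [0, 0, 0, 0],
        "questionnaire_id": [88, 88, 89, 89],
        "esm_user_answer_numeric": [2, 4, 1, 3],
    }
)
pd.pivot_table(
    df_sam_demo,
    index=["participant_id", "device_id", "esm_session"],
    columns=["questionnaire_id"],
    values=["esm_user_answer_numeric"],
    aggfunc="mean",
)
# -> one row with the threat mean 3.0 (questionnaire 88) and the challenge mean 2.0 (89).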
def detect_stressful_event(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
"""
Participants were asked: "Was there a particular event that created tension in you?"
The following options were available:
0 - No,
1 - Yes, slightly,
2 - Yes, moderately,
3 - Yes, considerably,
4 - Yes, extremely.
This function indicates whether there was a stressful event (True/False)
and how stressful it was on a scale of 1 to 4.
Parameters
----------
df_esm_sam_clean: pd.DataFrame
A cleaned up dataframe of Stress Appraisal Measure items.
Returns
-------
df_esm_event_stress: pd.DataFrame
The same dataframe with two new columns:
- event_present, indicating whether there was a stressful event at all,
- event_stressfulness, a numeric answer (1-4) to the single item question.
"""
df_esm_event_stress = df_esm_sam_clean[
df_esm_sam_clean["questionnaire_id"] == QUESTIONNAIRE_ID_SAM.get("event_stress")
]
df_esm_event_stress = df_esm_event_stress.assign(
event_present=lambda x: x.esm_user_answer_numeric > 0,
event_stressfulness=lambda x: x.esm_user_answer_numeric,
)
return df_esm_event_stress
def detect_event_work_related(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
"""
This function simply adds a column indicating the answer to the question:
"Was/is this event work-related?"
Parameters
----------
df_esm_sam_clean: pd.DataFrame
A cleaned up dataframe of Stress Appraisal Measure items.
Returns
-------
df_esm_event_stress: pd.DataFrame
The same dataframe with a new column event_work_related (True/False).
"""
df_esm_event_stress = df_esm_sam_clean[
df_esm_sam_clean["questionnaire_id"]
== QUESTIONNAIRE_ID_SAM.get("event_work_related")
]
df_esm_event_stress = df_esm_event_stress.assign(
event_work_related=lambda x: x.esm_user_answer_numeric > 0
)
return df_esm_event_stress
def convert_event_time(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
"""
This function only serves to convert the string datetime answer into a real datetime type.
Errors during this conversion are coerced, meaning that non-datetime answers are assigned Not a Time (NaT).
NOTE: Since the only available non-datetime answer to this question was "0 - I do not remember",
the NaTs can be interpreted to mean this.
Parameters
----------
df_esm_sam_clean: pd.DataFrame
A cleaned up dataframe of Stress Appraisal Measure items.
Returns
-------
df_esm_event_time: pd.DataFrame
The same dataframe with a new column event_time of datetime type.
"""
df_esm_event_time = df_esm_sam_clean[
df_esm_sam_clean["questionnaire_id"] == QUESTIONNAIRE_ID_SAM.get("event_time")
].assign(
event_time=lambda x: pd.to_datetime(
x.esm_user_answer, errors="coerce", infer_datetime_format=True, exact=True
)
)
return df_esm_event_time
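# An illustrative sketch (hypothetical answers) of the coercion above: valid datetime
# strings are parsed, while the "0 - I do not remember" option becomes NaT.
event_time_demo = pd.to_datetime(
    pd.Series(["2020-08-01 14:30:00", "0 - I do not remember"]), errors="coerce"
)
# -> [Timestamp("2020-08-01 14:30:00"), NaT]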
def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
"""
If participants indicated a stressful event, they were asked:
"How long did this event last? (Answer in hours and minutes)"
This function extracts the event duration and also records the additional answers:
0 - I do not remember,
1 - It is still going on.
Parameters
----------
df_esm_sam_clean: pd.DataFrame
A cleaned up dataframe of Stress Appraisal Measure items.
Returns
-------
df_esm_event_duration: pd.DataFrame
The same dataframe with two new columns:
- event_duration, the time part of a datetime,
- event_duration_info, giving other options to this question:
0 - I do not remember,
1 - It is still going on
"""
df_esm_event_duration = df_esm_sam_clean[
df_esm_sam_clean["questionnaire_id"]
== QUESTIONNAIRE_ID_SAM.get("event_duration")
].assign(
event_duration=lambda x: pd.to_datetime(
x.esm_user_answer.str.slice(start=0, stop=-6), errors="coerce"
).dt.time
)
# TODO Explore the values recorded in event_duration and possibly fix mistakes.
# For example, participants reported setting 23:50:00 instead of 00:50:00.
# For the events for which no duration was found (i.e. event_duration = NaT),
# we can determine whether:
# - this event is still going on ("1 - It is still going on"), or
# - the participant could not remember its duration ("0 - I do not remember").
# Generally, these answers were converted to esm_user_answer_numeric in clean_up_esm,
# but only for numeric question and answer types.
# Since this question was of "datetime" type, convert these specific answers here again.
df_esm_event_duration["event_duration_info"] = np.nan
df_esm_event_duration[
df_esm_event_duration.event_duration.isna()
] = df_esm_event_duration[df_esm_event_duration.event_duration.isna()].assign(
event_duration_info=lambda x: x.esm_user_answer.str.slice(stop=1).astype(int)
)
return df_esm_event_duration
# TODO: How many questions about the stressfulness of the period were asked and how does this relate to events?

View File

@@ -1,41 +0,0 @@
import datetime
import pandas as pd
from pytz import timezone
TZ_LJ = timezone("Europe/Ljubljana")
COLUMN_TIMESTAMP = "timestamp"
COLUMN_TIMESTAMP_ESM = "double_esm_user_answer_timestamp"
def get_date_from_timestamp(df_aware) -> pd.DataFrame:
"""
Transform a UNIX timestamp into a datetime (with Ljubljana timezone).
Additionally, extract only the date part, where anything until 4 AM is considered the same day.
Parameters
----------
df_aware: pd.DataFrame
Any AWARE-type data as defined in models.py.
Returns
-------
df_aware: pd.DataFrame
The same dataframe with datetime_lj and date_lj columns added.
"""
if COLUMN_TIMESTAMP_ESM in df_aware:
column_timestamp = COLUMN_TIMESTAMP_ESM
else:
column_timestamp = COLUMN_TIMESTAMP
df_aware["datetime_lj"] = df_aware[column_timestamp].apply(
lambda x: datetime.datetime.fromtimestamp(x / 1000.0, tz=TZ_LJ)
)
df_aware = df_aware.assign(
date_lj=lambda x: (x.datetime_lj - datetime.timedelta(hours=4)).dt.date
)
# Since daytime EMAs could *theoretically* last beyond midnight, but never after 4 AM,
# the datetime is first translated to 4 h earlier.
return df_aware
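# A worked example (hypothetical timestamp) of the 4 AM rule above: an answer given at
# 02:30 local time on 2020-08-02 is assigned date_lj 2020-08-01.
ts_ms_demo = 1596328200000  # 2020-08-02 02:30:00 Europe/Ljubljana, in milliseconds
dt_lj_demo = datetime.datetime.fromtimestamp(ts_ms_demo / 1000.0, tz=TZ_LJ)
date_lj_demo = (dt_lj_demo - datetime.timedelta(hours=4)).date()
# dt_lj_demo == 2020-08-02 02:30:00+02:00, date_lj_demo == 2020-08-01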

View File

@@ -28,63 +28,3 @@ def get_proximity_data(usernames: Collection) -> pd.DataFrame:
with db_engine.connect() as connection:
df_proximity = pd.read_sql(query_proximity.statement, connection)
return df_proximity
def recode_proximity(df_proximity: pd.DataFrame) -> pd.DataFrame:
"""
This function recodes proximity from a double to a boolean value.
Different proximity sensors report different values,
but in our data only a few distinct values have ever been found.
These are therefore converted into "near" and "far" binary values.
See expl_proximity.ipynb for additional info.
Parameters
----------
df_proximity: pd.DataFrame
A dataframe of proximity data.
Returns
-------
df_proximity: pd.DataFrame
The same dataframe with an additional column bool_prox_near,
indicating whether "near" proximity was reported.
False values correspond to "far" reported by this sensor.
"""
df_proximity = df_proximity.assign(bool_prox_near=lambda x: x.double_proximity == 0)
return df_proximity
def count_proximity(
df_proximity: pd.DataFrame, group_by: Collection = ["participant_id"]
) -> pd.DataFrame:
"""
The function counts how many times a "near" value occurs in the proximity data
and calculates the proportion of these counts to all proximity values (i.e. the relative count).
Parameters
----------
df_proximity: pd.DataFrame
A dataframe of proximity data.
group_by: Collection
A list of strings, specifying by which parameters to group.
By default, the features are calculated per participant, but could be "date_lj" etc.
Returns
-------
df_proximity_features: pd.DataFrame
A dataframe with the count of "near" proximity values and their relative count.
"""
if "bool_prox_near" not in df_proximity:
df_proximity = recode_proximity(df_proximity)
df_proximity["bool_prox_far"] = ~df_proximity["bool_prox_near"]
df_proximity_features = df_proximity.groupby(group_by).sum()[
["bool_prox_near", "bool_prox_far"]
]
df_proximity_features = df_proximity_features.assign(
prop_prox_near=lambda x: x.bool_prox_near / (x.bool_prox_near + x.bool_prox_far)
)
df_proximity_features = df_proximity_features.rename(
columns={"bool_prox_near": "freq_prox_near"}
).drop(columns="bool_prox_far", inplace=False)
return df_proximity_features
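# An illustrative sketch (hypothetical readings) of count_proximity() above: 0.0 is
# recoded as "near", so three near readings out of four give prop_prox_near = 0.75.
import pandas as pd

df_prox_demo = pd.DataFrame(
    {"participant_id": [1, 1, 1, 1], "double_proximity": [0.0, 0.0, 5.0, 0.0]}
)
count_proximity(df_prox_demo)
# -> freq_prox_near = 3, prop_prox_near = 0.75 for participant 1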

View File

@@ -6,7 +6,7 @@
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.11.4
# jupytext_version: 1.11.2
# kernelspec:
# display_name: straw2analysis
# language: python
@@ -14,12 +14,12 @@
# ---
# %%
# %matplotlib inline
import datetime
# %%
import os
import sys
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
@@ -31,24 +31,6 @@ if nb_dir not in sys.path:
import participants.query_db
from features.esm import *
# %%
SAVE_FIGS = True
FIG_HEIGHT = 5
FIG_ASPECT = 1.7
FIG_COLOUR = "#28827C"
SMALL_SIZE = 14
MEDIUM_SIZE = SMALL_SIZE + 2
BIGGER_SIZE = MEDIUM_SIZE + 2
plt.rc("font", size=SMALL_SIZE) # controls default text sizes
plt.rc("axes", titlesize=SMALL_SIZE) # fontsize of the axes title
plt.rc("axes", labelsize=MEDIUM_SIZE) # fontsize of the x and y labels
plt.rc("xtick", labelsize=SMALL_SIZE) # fontsize of the tick labels
plt.rc("ytick", labelsize=SMALL_SIZE) # fontsize of the tick labels
plt.rc("legend", fontsize=SMALL_SIZE) # legend fontsize
plt.rc("figure", titlesize=BIGGER_SIZE) # fontsize of the figure title
# %%
baseline_si = pd.read_csv("E:/STRAWbaseline/results-survey637813.csv")
baseline_be_1 = pd.read_csv("E:/STRAWbaseline/results-survey358134.csv")
@@ -148,7 +130,7 @@ df_adherence.describe()
df_adherence[["gender", "startlanguage"]].value_counts()
# %%
sns.displot(df_adherence["finished_sessions"], binwidth=5, height=FIG_HEIGHT)
sns.displot(df_adherence["finished_sessions"], binwidth=5, height=5)
# %%
lm_adherence = smf.ols(
@@ -242,14 +224,12 @@ df_session_workday = df_session_workday.assign(
g1 = sns.displot(
df_session_workday["time_diff_minutes"],
binwidth=5,
height=FIG_HEIGHT,
aspect=FIG_ASPECT,
color=FIG_COLOUR,
height=5,
aspect=1.5,
color="#28827C",
)
g1.set_axis_labels("Time difference [min]", "Session count")
g1.set(xlim=(0, 570))
if SAVE_FIGS:
g1.savefig("WorkdayEMAtimeDiff.pdf")
# g1.savefig("WorkdayEMAtimeDiff.pdf")
# %% [markdown]
# There are some sessions that are really close together. By design, none should be closer than 30 min. Let's take a look at those.
@@ -316,13 +296,12 @@ df_mean_daytime_interval.describe()
g2 = sns.displot(
df_mean_daytime_interval.time_diff_minutes,
binwidth=5,
height=FIG_HEIGHT,
aspect=FIG_ASPECT,
color=FIG_COLOUR,
height=5,
aspect=1.5,
color="#28827C",
)
g2.set_axis_labels("Median time difference [min]", "Participant count")
if SAVE_FIGS:
g2.savefig("WorkdayEMAtimeDiffMedianParticip.pdf")
# g2.savefig("WorkdayEMAtimeDiffMedianParticip.pdf")
# %%
df_adherence = df_adherence.merge(
@@ -348,9 +327,9 @@ df_count_daytime_per_participant["time"].describe()
sns.displot(
df_count_daytime_per_participant.time,
binwidth=1,
height=FIG_HEIGHT,
aspect=FIG_ASPECT,
color=FIG_COLOUR,
height=5,
aspect=1.5,
color="#28827C",
)
# %% [markdown]
@@ -385,14 +364,13 @@ s_evening_completed_ratio.describe()
g3 = sns.displot(
s_evening_completed_ratio - 0.001,
binwidth=0.05,
height=FIG_HEIGHT,
aspect=FIG_ASPECT,
color=FIG_COLOUR,
height=5,
aspect=1.5,
color="#28827C",
)
g3.set_axis_labels("Ratio of days with the evening EMA filled out", "Participant count")
g3.set(xlim=(1.01, 0.59))
if SAVE_FIGS:
g3.savefig("EveningEMAratioParticip.pdf")
# g3.savefig("EveningEMAratioParticip.pdf")
# %%
df_adherence = df_adherence.merge(
@@ -408,3 +386,5 @@ lr_ols_evening_ratio = smf.ols(
)
ls_result_evening_ratio = lr_ols_evening_ratio.fit()
ls_result_evening_ratio.summary()
# %%
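# %% [markdown]
# A minimal sketch (hypothetical data and column names) of the smf.ols() formula API used
# in this notebook: a single predictor regressed on the number of finished sessions.
# %%
import pandas as pd
import statsmodels.formula.api as smf

df_demo = pd.DataFrame(
    {"finished_sessions": [40, 55, 38, 61, 47], "age": [25, 31, 28, 44, 36]}
)
smf.ols("finished_sessions ~ age", data=df_demo).fit().summary()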

View File

@@ -16,16 +16,7 @@ class EsmFeatures(unittest.TestCase):
def test_preprocess_esm(self):
self.esm_processed = preprocess_esm(self.esm)
# Check for columns which should have been extracted from esm_json.
self.assertIn("question_id", self.esm_processed)
self.assertIn("questionnaire_id", self.esm_processed)
self.assertIn("esm_instructions", self.esm_processed)
self.assertIn("esm_type", self.esm_processed)
self.assertIn("time", self.esm_processed)
# Check for explicitly added column.
self.assertIn("datetime_lj", self.esm_processed)
# All of these keys are referenced in other functions, so they are expected to be present in preprocessed ESM.
# Since all of these are added in a single function, it should be OK to have many assert statements in one test.
def test_classify_sessions_by_completion(self):
self.esm_classified_sessions = classify_sessions_by_completion(