Compare commits
33 Commits
1aaf95fe9e
...
577a874288
Author | SHA1 | Date |
---|---|---|
junos | 577a874288 | |
junos | c8bb481508 | |
junos | 98f1df81c6 | |
junos | ad85f79bc5 | |
junos | 070cfdba80 | |
junos | c6d0e4391e | |
junos | af65d0864f | |
junos | a2180aee54 | |
junos | a06ad0800f | |
junos | 06e1fe7410 | |
junos | 02f2607be9 | |
junos | cca5a29483 | |
junos | e3d735163f | |
Ivan Kobe | 1b53865f0a | |
Ivan Kobe | 4ac5f37c19 | |
junos | 2fc80a34e7 | |
junos | fbd9c2fc32 | |
Junos Lukan | d8899fa75b | |
Ivan Kobe | 62af04fe09 | |
junos | 33ebf9caea | |
junos | 40293c4752 | |
junos | 9e87b1f176 | |
Ivan Kobe | 4a2ca581b3 | |
Ivan Kobe | d98b673824 | |
junos | 1bdb334c42 | |
junos | b99136a181 | |
junos | 9bd42afa02 | |
junos | 0f5af21f71 | |
junos | c4f7b6459d | |
junos | 19cddaa634 | |
junos | 763b970a42 | |
junos | 3c12a6e74a | |
junos | 28e9db15f5 | |
@ -16,6 +16,7 @@ dependencies:
  - python-dotenv
  - pytz
  - seaborn
  - scikit-learn
  - sqlalchemy
  - statsmodels
  - tabulate
@ -0,0 +1,150 @@
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.11.4
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %%
# %matplotlib inline
import datetime
import os
import sys

import seaborn as sns
from sklearn import linear_model
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score

nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

# %%
import participants.query_db
from features import esm, helper, proximity

# %% [markdown]
# # 1. Get the relevant data

# %%
participants_inactive_usernames = participants.query_db.get_usernames(
    collection_start=datetime.date.fromisoformat("2020-08-01")
)
# Consider only two participants to simplify.
ptcp_2 = participants_inactive_usernames[0:2]

# %% [markdown]
# ## 1.1 Labels

# %%
df_esm = esm.get_esm_data(ptcp_2)
df_esm_preprocessed = esm.preprocess_esm(df_esm)

# %%
df_esm_PANAS = df_esm_preprocessed[
    (df_esm_preprocessed["questionnaire_id"] == 8)
    | (df_esm_preprocessed["questionnaire_id"] == 9)
]
df_esm_PANAS_clean = esm.clean_up_esm(df_esm_PANAS)

# %% [markdown]
# ## 1.2 Sensor data

# %%
df_proximity = proximity.get_proximity_data(ptcp_2)
df_proximity = helper.get_date_from_timestamp(df_proximity)
df_proximity = proximity.recode_proximity(df_proximity)

# %% [markdown]
# ## 1.3 Standardization/personalization

# %% [markdown]
# # 2. Grouping/segmentation

# %%
df_esm_PANAS_daily_means = (
    df_esm_PANAS_clean.groupby(["participant_id", "date_lj", "questionnaire_id"])
    .esm_user_answer_numeric.agg("mean")
    .reset_index()
    .rename(columns={"esm_user_answer_numeric": "esm_numeric_mean"})
)

# %%
df_esm_PANAS_daily_means = (
    df_esm_PANAS_daily_means.pivot(
        index=["participant_id", "date_lj"],
        columns="questionnaire_id",
        values="esm_numeric_mean",
    )
    .reset_index(col_level=1)
    .rename(columns={8.0: "PA", 9.0: "NA"})
    .set_index(["participant_id", "date_lj"])
)


# %%
df_proximity_daily_counts = proximity.count_proximity(
    df_proximity, ["participant_id", "date_lj"]
)

# %%
df_proximity_daily_counts

# %% [markdown]
# # 3. Join features (and export to csv?)

# %%
df_full_data_daily_means = df_esm_PANAS_daily_means.join(
    df_proximity_daily_counts
).reset_index()
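
# %% [markdown]
# A possible answer to the question raised in the heading above (a sketch, not part of
# the original analysis): write the joined daily features to a CSV file. The file name
# used here is a placeholder.

# %%
df_full_data_daily_means.to_csv("daily_features.csv", index=False)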

# %% [markdown]
# # 4. Machine learning model and parameters

# %%
lin_reg_proximity = linear_model.LinearRegression()

# %% [markdown]
# ## 4.1 Validation method

# %%
logo = LeaveOneGroupOut()
logo.get_n_splits(
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
    groups=df_full_data_daily_means["participant_id"],
)
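
# %% [markdown]
# A minimal illustration (an addition, not from the original notebook) of what the
# leave-one-group-out splitter does: each fold holds out all rows belonging to one
# participant, so the model is always evaluated on an unseen participant.

# %%
for train_index, test_index in logo.split(
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
    groups=df_full_data_daily_means["participant_id"],
):
    print(len(train_index), "train rows,", len(test_index), "test rows")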

# %% [markdown]
# ## 4.2 Fit results (export?)

# %%
cross_val_score(
    lin_reg_proximity,
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
    groups=df_full_data_daily_means["participant_id"],
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)

# %%
lin_reg_proximity.fit(
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
)

# %%
lin_reg_proximity.score(
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
)
@ -0,0 +1,76 @@
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.11.4
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %%
# %matplotlib inline
import os
import sys

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

# %%
from config.models import AppCategories, Participant
from setup import db_engine, session

# %%
query_app_categories = session.query(AppCategories)
with db_engine.connect() as connection:
    df_app_categories = pd.read_sql(query_app_categories.statement, connection)

# %%
df_app_categories.head()

# %%
df_app_categories["play_store_genre"].value_counts()

# %%
df_category_not_found = df_app_categories[
    df_app_categories["play_store_genre"] == "not_found"
]

# %%
df_category_not_found["play_store_response"].value_counts()

# %%
df_category_not_found["package_name"].value_counts()

# %%
manufacturers = [
    "samsung",
    "oneplus",
    "huawei",
    "xiaomi",
    "lge",
    "motorola",
    "miui",
    "lenovo",
    "oppo",
    "mediatek",
]
custom_rom = ["coloros", "lineageos", "myos", "cyanogenmod", "foundation.e"]
other = ["android", "wssyncmldm"]
rows_os_manufacturer = df_category_not_found["package_name"].str.contains(
    "|".join(manufacturers + custom_rom + other), case=False
)

# %%
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(df_category_not_found.loc[~rows_os_manufacturer])
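
# %% [markdown]
# A small follow-up sketch (an addition, not part of the original notebook): tally how
# many of the "not_found" packages match an OS/manufacturer pattern versus how many
# remain unexplained.

# %%
rows_os_manufacturer.value_counts()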
@ -6,7 +6,7 @@
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.11.2
#       jupytext_version: 1.11.4
#   kernelspec:
#     display_name: straw2analysis
#     language: python
@ -14,6 +14,7 @@
# ---

# %%
# %matplotlib inline
import os
import sys

@ -53,6 +54,15 @@ import participants.query_db
participants_inactive_usernames = participants.query_db.get_usernames()
df_calls_inactive = get_call_data(participants_inactive_usernames)

# %%
participants_inactive_usernames

# %%
df_calls_inactive.head()

# %%
enumerate_contacts(df_calls_inactive).head()

# %%
df_calls_features = count_comms(df_calls_inactive)
df_calls_features.head()
@ -70,6 +80,9 @@ calls_number = pd.wide_to_long(
    suffix="\D+",
)

# %%
calls_number

# %%
sns.displot(calls_number, x="no", hue="call_type", binwidth=5, element="step", height=8)

@ -126,3 +139,30 @@ sms_number = pd.wide_to_long(
sns.displot(
    sms_number, x="no", hue="message_type", binwidth=5, element="step", height=8
)

# %% [markdown]
# # Communication features

# %%
df_calls_enumerated = enumerate_contacts(df_calls)
display(df_calls_enumerated)

# %%
df_calls_contact_features = contact_features(df_calls_enumerated)
display(df_calls_contact_features)

# %%
df_sms_enumerated = enumerate_contacts(df_sms)
df_sms_contact_features = contact_features(df_sms_enumerated)
display(df_sms_contact_features)

# %%
display(count_comms(df_calls))

# %%
display(count_comms(df_sms))

# %%
display(calls_sms_features(df_calls, df_sms))

# %%
@ -6,7 +6,7 @@
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.11.2
#       jupytext_version: 1.11.4
#   kernelspec:
#     display_name: straw2analysis
#     language: python
@ -14,6 +14,7 @@
# ---

# %%
# %matplotlib inline
import os
import sys

@ -86,7 +86,8 @@ def enumerate_contacts(comm_df: pd.DataFrame) -> pd.DataFrame:
    # In other words, recode the contacts into integers from 0 to n_contacts,
    # so that the first one is contacted the most often.
    contact_ids = (
        contact_counts.groupby("participant_id")  # Group again for enumeration.
        # Group again for enumeration.
        contact_counts.groupby("participant_id")
        .cumcount()  # Enumerate (count) rows *within* participants.
        .to_frame("contact_id")
    )
@ -176,15 +177,148 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame:
    return comm_features


def contact_features():
    # TODO Implement a method that takes a DF with enumerated contacts as argument and calculates:
    # * Duration of calls per caller (for most common callers)
    # * Determine work vs non-work contacts by work hours heuristics
    # * Number of people contacted
    # And similarly for SMS.
    pass
def contact_features(df_enumerated: pd.DataFrame) -> pd.DataFrame:
    """
    Counts the number of people contacted (for each participant) and, if
    df_enumerated is a dataframe containing calls data, the total duration
    of calls between a participant and each of her contacts.

    Parameters
    ----------
    df_enumerated: pd.DataFrame
        A dataframe of calls or SMSes; return of function enumerate_contacts.

    Returns
    -------
    comm_df: pd.DataFrame
        The altered dataframe with the column no_contacts and, if df_enumerated
        contains calls data, an additional column total_call_duration.
    """

    # Check whether df contains calls or SMS data, since some
    # features we want to calculate are type-specific.
    if "call_duration" in df_enumerated:
        # Add a column with the total duration of calls between two people.
        duration_count = (
            df_enumerated.groupby(["participant_id", "contact_id"])
            # For each participant and for each caller, sum durations of their calls.
            ["call_duration"]
            .sum()
            .reset_index()  # Make index (which is actually the participant id) a normal column.
            .rename(columns={"call_duration": "total_call_duration"})
        )
        # The new dataframe now contains columns containing information about
        # participants, callers and the total duration of their calls. All that
        # is now left to do is to merge the original df with the new one.
        df_enumerated = df_enumerated.merge(
            duration_count, on=["participant_id", "contact_id"]
        )

    contact_count = (
        df_enumerated.groupby(["participant_id"])
        .nunique()[
            "contact_id"
        ]  # For each participant, count the number of distinct contacts.
        .reset_index()  # Make index (which is actually the participant id) a normal column.
        .rename(columns={"contact_id": "no_contacts"})
    )

    df_enumerated = (
        # Merge df with the newly created df containing info about the number of contacts.
        df_enumerated.merge(contact_count, on="participant_id")
        # Sort first by participant_id and then by contact_id and
        # thereby restore the initial ordering of the input dataframes.
        .sort_values(["participant_id", "contact_id"])
    )

    # TODO: Determine work vs non-work contacts by work hours heuristics.

    return df_enumerated
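

# A possible sketch for the work-hours TODO above (an assumption, not project code):
# treat a contact as work-related if most of the communication with them happens on
# weekdays between 8:00 and 17:00. It assumes a datetime_lj column such as the one
# added by helper.get_date_from_timestamp.
def classify_work_contacts(df_enumerated: pd.DataFrame) -> pd.DataFrame:
    is_work_hours = (
        (df_enumerated["datetime_lj"].dt.weekday < 5)
        & (df_enumerated["datetime_lj"].dt.hour >= 8)
        & (df_enumerated["datetime_lj"].dt.hour < 17)
    )
    work_share = (
        df_enumerated.assign(in_work_hours=is_work_hours)
        .groupby(["participant_id", "contact_id"])["in_work_hours"]
        .mean()  # Share of communications with this contact during work hours.
        .rename("work_hours_share")
        .reset_index()
        .assign(contact_is_work=lambda x: x.work_hours_share > 0.5)
    )
    return df_enumerated.merge(work_share, on=["participant_id", "contact_id"])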


def calls_sms_features():
    # TODO Relate the calls and sms data, such as comparing the number of (missed) calls and messages.
    pass
def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataFrame:
    """
    Calculates additional features relating calls and sms data.

    Parameters
    ----------
    df_calls: pd.DataFrame
        A dataframe of calls (return of get_call_data).
    df_sms: pd.DataFrame
        A dataframe of SMSes (return of get_sms_data).

    Returns
    -------
    df_calls_sms: pd.DataFrame
        The list of features relating calls and sms data for every participant.
        These are:
        * proportion_calls:
            proportion of calls in total number of communications
        * proportion_calls_incoming:
            proportion of incoming calls in total number of incoming/received communications
        * proportion_calls_outgoing:
            proportion of outgoing calls in total number of outgoing/sent communications
        * proportion_calls_missed_sms_received:
            proportion of missed calls to the number of received messages
        * proportion_calls_contacts:
            proportion of calls contacts in total number of communication contacts
    """

    count_calls = count_comms(df_calls)
    count_sms = count_comms(df_sms)

    count_joined = (
        count_calls.merge(
            count_sms, on="participant_id", suffixes=("_calls", "_sms")
        )  # Merge calls and sms features.
        .reset_index()  # Make participant_id a regular column.
        .assign(
            proportion_calls=(
                lambda x: x.no_all_calls / (x.no_all_calls + x.no_all_sms)
            ),
            proportion_calls_incoming=(
                lambda x: x.no_incoming / (x.no_incoming + x.no_received)
            ),
            proportion_calls_missed_sms_received=(
                lambda x: x.no_missed / (x.no_missed + x.no_received)
            ),
            proportion_calls_outgoing=(
                lambda x: x.no_outgoing / (x.no_outgoing + x.no_sent)
            )
            # Calculate new features and create additional columns.
        )[
            [
                "participant_id",
                "proportion_calls",
                "proportion_calls_incoming",
                "proportion_calls_outgoing",
                "proportion_calls_missed_sms_received",
            ]
        ]  # Filter out only the relevant features.
    )

    features_calls = contact_features(enumerate_contacts(df_calls))
    features_sms = contact_features(enumerate_contacts(df_sms))

    features_joined = (
        features_calls.merge(
            features_sms, on="participant_id", suffixes=("_calls", "_sms")
        )  # Merge calls and sms features.
        .reset_index()  # Make participant_id a regular column.
        .assign(
            proportion_calls_contacts=(
                lambda x: x.no_contacts_calls
                / (x.no_contacts_calls + x.no_contacts_sms)
            )  # Calculate new features and create additional columns.
        )[
            ["participant_id", "proportion_calls_contacts"]
        ]  # Filter out only the relevant features.
        # Since we are interested only in some features and ignored
        # others, a lot of duplicate rows were created. Remove them.
        .drop_duplicates()
    )

    # Join the newly created dataframes.
    df_calls_sms = count_joined.merge(features_joined, on="participant_id")

    return df_calls_sms
@ -1,14 +1,12 @@
import datetime
from collections.abc import Collection

import numpy as np
import pandas as pd
from pytz import timezone

from config.models import ESM, Participant
from features import helper
from setup import db_engine, session

TZ_LJ = timezone("Europe/Ljubljana")
ESM_STATUS_ANSWERED = 2

GROUP_SESSIONS_BY = ["participant_id", "device_id", "esm_session"]
@ -67,14 +65,8 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
    df_esm_preprocessed: pd.DataFrame
        A dataframe with added columns: datetime in Ljubljana timezone and all fields from ESM_JSON column.
    """
    df_esm["datetime_lj"] = df_esm["double_esm_user_answer_timestamp"].apply(
        lambda x: datetime.datetime.fromtimestamp(x / 1000.0, tz=TZ_LJ)
    )
    df_esm = df_esm.assign(
        date_lj=lambda x: (x.datetime_lj - datetime.timedelta(hours=4)).dt.date
    )
    # Since daytime EMAs could *theoretically* last beyond midnight, but never after 4 AM,
    # the datetime is first translated to 4 h earlier.
    df_esm = helper.get_date_from_timestamp(df_esm)

    df_esm_json = pd.json_normalize(df_esm["esm_json"]).drop(
        columns=["esm_trigger"]
    )  # The esm_trigger column is already present in the main df.
@ -256,9 +248,9 @@ def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
        ESM.ESM_TYPE.get("scale"),
        ESM.ESM_TYPE.get("number"),
    ]
    df_esm_clean[df_esm_clean["esm_type"].isin(esm_type_numeric)] = df_esm_clean[
    df_esm_clean.loc[
        df_esm_clean["esm_type"].isin(esm_type_numeric)
    ].assign(
    ] = df_esm_clean.loc[df_esm_clean["esm_type"].isin(esm_type_numeric)].assign(
        esm_user_answer_numeric=lambda x: x.esm_user_answer.str.slice(stop=1).astype(
            int
        )
@ -0,0 +1,267 @@
import numpy as np
import pandas as pd

import features.esm

QUESTIONNAIRE_ID_SAM = {
    "event_stress": 87,
    "event_threat": 88,
    "event_challenge": 89,
    "event_time": 90,
    "event_duration": 91,
    "event_work_related": 92,
    "period_stress": 93,
}
QUESTIONNAIRE_ID_SAM_LOW = min(QUESTIONNAIRE_ID_SAM.values())
QUESTIONNAIRE_ID_SAM_HIGH = max(QUESTIONNAIRE_ID_SAM.values())

GROUP_QUESTIONNAIRES_BY = [
    "participant_id",
    "device_id",
    "esm_session",
]
# Each questionnaire occurs only once within each esm_session on the same device within the same participant.


def extract_stressful_events(df_esm: pd.DataFrame) -> pd.DataFrame:
    # 0. Select only questions from the Stress Appraisal Measure.
    df_esm_preprocessed = features.esm.preprocess_esm(df_esm)
    df_esm_sam = df_esm_preprocessed[
        (df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_ID_SAM_LOW)
        & (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_ID_SAM_HIGH)
    ]

    df_esm_sam_clean = features.esm.clean_up_esm(df_esm_sam)
    # 1.
    df_esm_event_threat_challenge_mean_wide = calculate_threat_challenge_means(
        df_esm_sam_clean
    )
    # 2.
    df_esm_event_stress = detect_stressful_event(df_esm_sam_clean)

    # Join to the previously calculated features related to the events.
    df_esm_events = df_esm_event_threat_challenge_mean_wide.join(
        df_esm_event_stress[
            GROUP_QUESTIONNAIRES_BY + ["event_present", "event_stressfulness"]
        ].set_index(GROUP_QUESTIONNAIRES_BY)
    )

    # 3.
    df_esm_event_work_related = detect_event_work_related(df_esm_sam_clean)

    df_esm_events = df_esm_events.join(
        df_esm_event_work_related[
            GROUP_QUESTIONNAIRES_BY + ["event_work_related"]
        ].set_index(GROUP_QUESTIONNAIRES_BY)
    )

    # 4.
    df_esm_event_time = convert_event_time(df_esm_sam_clean)

    df_esm_events = df_esm_events.join(
        df_esm_event_time[GROUP_QUESTIONNAIRES_BY + ["event_time"]].set_index(
            GROUP_QUESTIONNAIRES_BY
        )
    )

    # 5.
    df_esm_event_duration = extract_event_duration(df_esm_sam_clean)

    df_esm_events = df_esm_events.join(
        df_esm_event_duration[
            GROUP_QUESTIONNAIRES_BY + ["event_duration", "event_duration_info"]
        ].set_index(GROUP_QUESTIONNAIRES_BY)
    )

    return df_esm_events


def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    """
    This function calculates challenge and threat (two Stress Appraisal Measure subscales) means
    for each ESM session (within participants and devices).
    It creates a grouped dataframe with means in two columns.

    Parameters
    ----------
    df_esm_sam_clean: pd.DataFrame
        A cleaned up dataframe of Stress Appraisal Measure items.

    Returns
    -------
    df_esm_event_threat_challenge_mean_wide: pd.DataFrame
        A dataframe of unique ESM sessions (by participants and devices) with threat and challenge means.
    """
    # Select only threat and challenge assessments for events.
    df_esm_event_threat_challenge = df_esm_sam_clean[
        (
            df_esm_sam_clean["questionnaire_id"]
            == QUESTIONNAIRE_ID_SAM.get("event_threat")
        )
        | (
            df_esm_sam_clean["questionnaire_id"]
            == QUESTIONNAIRE_ID_SAM.get("event_challenge")
        )
    ]
    # Calculate the mean of threat and challenge subscales for each ESM session.
    df_esm_event_threat_challenge_mean_wide = pd.pivot_table(
        df_esm_event_threat_challenge,
        index=["participant_id", "device_id", "esm_session"],
        columns=["questionnaire_id"],
        values=["esm_user_answer_numeric"],
        aggfunc="mean",
    )
    # Drop unnecessary column values.
    df_esm_event_threat_challenge_mean_wide.columns = df_esm_event_threat_challenge_mean_wide.columns.get_level_values(
        1
    )
    df_esm_event_threat_challenge_mean_wide.columns.name = None
    df_esm_event_threat_challenge_mean_wide.rename(
        columns={
            QUESTIONNAIRE_ID_SAM.get("event_threat"): "threat_mean",
            QUESTIONNAIRE_ID_SAM.get("event_challenge"): "challenge_mean",
        },
        inplace=True,
    )
    return df_esm_event_threat_challenge_mean_wide


def detect_stressful_event(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    """
    Participants were asked: "Was there a particular event that created tension in you?"
    The following options were available:
        0 - No,
        1 - Yes, slightly,
        2 - Yes, moderately,
        3 - Yes, considerably,
        4 - Yes, extremely.
    This function indicates whether there was a stressful event (True/False)
    and how stressful it was on a scale of 1 to 4.

    Parameters
    ----------
    df_esm_sam_clean: pd.DataFrame
        A cleaned up dataframe of Stress Appraisal Measure items.

    Returns
    -------
    df_esm_event_stress: pd.DataFrame
        The same dataframe with two new columns:
        - event_present, indicating whether there was a stressful event at all,
        - event_stressfulness, a numeric answer (1-4) to the single item question.
    """
    df_esm_event_stress = df_esm_sam_clean[
        df_esm_sam_clean["questionnaire_id"] == QUESTIONNAIRE_ID_SAM.get("event_stress")
    ]
    df_esm_event_stress = df_esm_event_stress.assign(
        event_present=lambda x: x.esm_user_answer_numeric > 0,
        event_stressfulness=lambda x: x.esm_user_answer_numeric,
    )
    return df_esm_event_stress


def detect_event_work_related(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    """
    This function simply adds a column indicating the answer to the question:
    "Was/is this event work-related?"

    Parameters
    ----------
    df_esm_sam_clean: pd.DataFrame
        A cleaned up dataframe of Stress Appraisal Measure items.

    Returns
    -------
    df_esm_event_stress: pd.DataFrame
        The same dataframe with a new column event_work_related (True/False).
    """
    df_esm_event_stress = df_esm_sam_clean[
        df_esm_sam_clean["questionnaire_id"]
        == QUESTIONNAIRE_ID_SAM.get("event_work_related")
    ]
    df_esm_event_stress = df_esm_event_stress.assign(
        event_work_related=lambda x: x.esm_user_answer_numeric > 0
    )
    return df_esm_event_stress


def convert_event_time(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    """
    This function only serves to convert the string datetime answer into a real datetime type.
    Errors during this conversion are coerced, meaning that non-datetime answers are assigned Not a Time (NaT).
    NOTE: Since the only available non-datetime answer to this question was "0 - I do not remember",
    the NaTs can be interpreted to mean this.

    Parameters
    ----------
    df_esm_sam_clean: pd.DataFrame
        A cleaned up dataframe of Stress Appraisal Measure items.

    Returns
    -------
    df_esm_event_time: pd.DataFrame
        The same dataframe with a new column event_time of datetime type.
    """
    df_esm_event_time = df_esm_sam_clean[
        df_esm_sam_clean["questionnaire_id"] == QUESTIONNAIRE_ID_SAM.get("event_time")
    ].assign(
        event_time=lambda x: pd.to_datetime(
            x.esm_user_answer, errors="coerce", infer_datetime_format=True, exact=True
        )
    )
    return df_esm_event_time


def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    """
    If participants indicated a stressful event, they were asked:
    "How long did this event last? (Answer in hours and minutes)"
    This function extracts this duration time and saves additional answers:
        0 - I do not remember,
        1 - It is still going on.

    Parameters
    ----------
    df_esm_sam_clean: pd.DataFrame
        A cleaned up dataframe of Stress Appraisal Measure items.

    Returns
    -------
    df_esm_event_duration: pd.DataFrame
        The same dataframe with two new columns:
        - event_duration, a time part of a datetime,
        - event_duration_info, giving other options to this question:
            0 - I do not remember,
            1 - It is still going on
    """
    df_esm_event_duration = df_esm_sam_clean[
        df_esm_sam_clean["questionnaire_id"]
        == QUESTIONNAIRE_ID_SAM.get("event_duration")
    ].assign(
        event_duration=lambda x: pd.to_datetime(
            x.esm_user_answer.str.slice(start=0, stop=-6), errors="coerce"
        ).dt.time
    )
    # TODO Explore the values recorded in event_duration and possibly fix mistakes.
    # For example, participants reported setting 23:50:00 instead of 00:50:00.

    # For the events for which no duration was found (i.e. event_duration = NaT),
    # we can determine whether:
    #    - this event is still going on ("1 - It is still going on"),
    #    - the participant couldn't remember its duration ("0 - I do not remember").
    # Generally, these answers were converted to esm_user_answer_numeric in clean_up_esm,
    # but only for the numeric types of questions and answers.
    # Since this one was of "datetime" type, convert these specific answers here again.
    df_esm_event_duration["event_duration_info"] = np.nan
    df_esm_event_duration[
        df_esm_event_duration.event_duration.isna()
    ] = df_esm_event_duration[df_esm_event_duration.event_duration.isna()].assign(
        event_duration_info=lambda x: x.esm_user_answer.str.slice(stop=1).astype(int)
    )

    return df_esm_event_duration


# TODO: How many questions about the stressfulness of the period were asked and how does this relate to events?
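

# A possible starting point for the TODO above (a sketch, not project code): count how
# many period_stress questionnaires each participant answered, using the same cleaned
# SAM dataframe expected by the functions above.
def count_period_stress_answers(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    df_period_stress = df_esm_sam_clean[
        df_esm_sam_clean["questionnaire_id"]
        == QUESTIONNAIRE_ID_SAM.get("period_stress")
    ]
    return (
        df_period_stress.groupby("participant_id")
        .size()  # Number of period_stress answers per participant.
        .rename("no_period_stress_answers")
        .reset_index()
    )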
@ -0,0 +1,41 @@
import datetime

import pandas as pd
from pytz import timezone

TZ_LJ = timezone("Europe/Ljubljana")
COLUMN_TIMESTAMP = "timestamp"
COLUMN_TIMESTAMP_ESM = "double_esm_user_answer_timestamp"


def get_date_from_timestamp(df_aware) -> pd.DataFrame:
    """
    Transform a UNIX timestamp into a datetime (with Ljubljana timezone).
    Additionally, extract only the date part, where anything until 4 AM is considered the same day.

    Parameters
    ----------
    df_aware: pd.DataFrame
        Any AWARE-type data as defined in models.py.

    Returns
    -------
    df_aware: pd.DataFrame
        The same dataframe with datetime_lj and date_lj columns added.

    """
    if COLUMN_TIMESTAMP_ESM in df_aware:
        column_timestamp = COLUMN_TIMESTAMP_ESM
    else:
        column_timestamp = COLUMN_TIMESTAMP

    df_aware["datetime_lj"] = df_aware[column_timestamp].apply(
        lambda x: datetime.datetime.fromtimestamp(x / 1000.0, tz=TZ_LJ)
    )
    df_aware = df_aware.assign(
        date_lj=lambda x: (x.datetime_lj - datetime.timedelta(hours=4)).dt.date
    )
    # Since daytime EMAs could *theoretically* last beyond midnight, but never after 4 AM,
    # the datetime is first translated to 4 h earlier.

    return df_aware
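

# Worked example of the 4 AM cutoff above (times are made up): a reading at
# 2021-05-02 03:30 Ljubljana time gets date_lj = 2021-05-01, whereas a reading at
# 2021-05-02 05:00 gets date_lj = 2021-05-02.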
@ -28,3 +28,63 @@ def get_proximity_data(usernames: Collection) -> pd.DataFrame:
    with db_engine.connect() as connection:
        df_proximity = pd.read_sql(query_proximity.statement, connection)
    return df_proximity


def recode_proximity(df_proximity: pd.DataFrame) -> pd.DataFrame:
    """
    This function recodes proximity from a double to a boolean value.
    Different proximity sensors report different values,
    but in our data only several distinct values have ever been found.
    These are therefore converted into "near" and "far" binary values.
    See expl_proximity.ipynb for additional info.

    Parameters
    ----------
    df_proximity: pd.DataFrame
        A dataframe of proximity data.

    Returns
    -------
    df_proximity: pd.DataFrame
        The same dataframe with an additional column bool_prox_near,
        indicating whether "near" proximity was reported.
        False values correspond to "far" reported by this sensor.

    """
    df_proximity = df_proximity.assign(bool_prox_near=lambda x: x.double_proximity == 0)
    return df_proximity


def count_proximity(
    df_proximity: pd.DataFrame, group_by: Collection = ["participant_id"]
) -> pd.DataFrame:
    """
    The function counts how many times a "near" value occurs in proximity
    and calculates the proportion of this count to all proximity values (i.e. the relative count).

    Parameters
    ----------
    df_proximity: pd.DataFrame
        A dataframe of proximity data.
    group_by: Collection
        A list of strings, specifying by which parameters to group.
        By default, the features are calculated per participant, but could be "date_lj" etc.

    Returns
    -------
    df_proximity_features: pd.DataFrame
        A dataframe with the count of "near" proximity values and their relative count.
    """
    if "bool_prox_near" not in df_proximity:
        df_proximity = recode_proximity(df_proximity)
    df_proximity["bool_prox_far"] = ~df_proximity["bool_prox_near"]
    df_proximity_features = df_proximity.groupby(group_by).sum()[
        ["bool_prox_near", "bool_prox_far"]
    ]
    df_proximity_features = df_proximity_features.assign(
        prop_prox_near=lambda x: x.bool_prox_near / (x.bool_prox_near + x.bool_prox_far)
    )
    df_proximity_features = df_proximity_features.rename(
        columns={"bool_prox_near": "freq_prox_near"}
    ).drop(columns="bool_prox_far", inplace=False)
    return df_proximity_features
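

# Worked example for count_proximity above (numbers are made up): a participant with
# 3 "near" readings out of 4 on a given day gets freq_prox_near = 3 and
# prop_prox_near = 0.75.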
@ -6,7 +6,7 @@
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.11.2
#       jupytext_version: 1.11.4
#   kernelspec:
#     display_name: straw2analysis
#     language: python
@ -14,12 +14,12 @@
# ---

# %%
# %matplotlib inline
import datetime

# %%
import os
import sys

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
@ -31,6 +31,24 @@ if nb_dir not in sys.path:
import participants.query_db
from features.esm import *

# %%
SAVE_FIGS = True
FIG_HEIGHT = 5
FIG_ASPECT = 1.7
FIG_COLOUR = "#28827C"

SMALL_SIZE = 14
MEDIUM_SIZE = SMALL_SIZE + 2
BIGGER_SIZE = MEDIUM_SIZE + 2

plt.rc("font", size=SMALL_SIZE)  # controls default text sizes
plt.rc("axes", titlesize=SMALL_SIZE)  # fontsize of the axes title
plt.rc("axes", labelsize=MEDIUM_SIZE)  # fontsize of the x and y labels
plt.rc("xtick", labelsize=SMALL_SIZE)  # fontsize of the tick labels
plt.rc("ytick", labelsize=SMALL_SIZE)  # fontsize of the tick labels
plt.rc("legend", fontsize=SMALL_SIZE)  # legend fontsize
plt.rc("figure", titlesize=BIGGER_SIZE)  # fontsize of the figure title

# %%
baseline_si = pd.read_csv("E:/STRAWbaseline/results-survey637813.csv")
baseline_be_1 = pd.read_csv("E:/STRAWbaseline/results-survey358134.csv")
@ -130,7 +148,7 @@ df_adherence.describe()
df_adherence[["gender", "startlanguage"]].value_counts()

# %%
sns.displot(df_adherence["finished_sessions"], binwidth=5, height=5)
sns.displot(df_adherence["finished_sessions"], binwidth=5, height=FIG_HEIGHT)

# %%
lm_adherence = smf.ols(
@ -224,12 +242,14 @@ df_session_workday = df_session_workday.assign(
g1 = sns.displot(
    df_session_workday["time_diff_minutes"],
    binwidth=5,
    height=5,
    aspect=1.5,
    color="#28827C",
    height=FIG_HEIGHT,
    aspect=FIG_ASPECT,
    color=FIG_COLOUR,
)
g1.set_axis_labels("Time difference [min]", "Session count")
# g1.savefig("WorkdayEMAtimeDiff.pdf")
g1.set(xlim=(0, 570))
if SAVE_FIGS:
    g1.savefig("WorkdayEMAtimeDiff.pdf")

# %% [markdown]
# There are some sessions that are really close together. By design, none should be closer than 30 min. Let's take a look at those.
@ -296,12 +316,13 @@ df_mean_daytime_interval.describe()
g2 = sns.displot(
    df_mean_daytime_interval.time_diff_minutes,
    binwidth=5,
    height=5,
    aspect=1.5,
    color="#28827C",
    height=FIG_HEIGHT,
    aspect=FIG_ASPECT,
    color=FIG_COLOUR,
)
g2.set_axis_labels("Median time difference [min]", "Participant count")
# g2.savefig("WorkdayEMAtimeDiffMedianParticip.pdf")
if SAVE_FIGS:
    g2.savefig("WorkdayEMAtimeDiffMedianParticip.pdf")

# %%
df_adherence = df_adherence.merge(
@ -327,9 +348,9 @@ df_count_daytime_per_participant["time"].describe()
sns.displot(
    df_count_daytime_per_participant.time,
    binwidth=1,
    height=5,
    aspect=1.5,
    color="#28827C",
    height=FIG_HEIGHT,
    aspect=FIG_ASPECT,
    color=FIG_COLOUR,
)

# %% [markdown]
@ -364,13 +385,14 @@ s_evening_completed_ratio.describe()
g3 = sns.displot(
    s_evening_completed_ratio - 0.001,
    binwidth=0.05,
    height=5,
    aspect=1.5,
    color="#28827C",
    height=FIG_HEIGHT,
    aspect=FIG_ASPECT,
    color=FIG_COLOUR,
)
g3.set_axis_labels("Ratio of days with the evening EMA filled out", "Participant count")
g3.set(xlim=(1.01, 0.59))
# g3.savefig("EveningEMAratioParticip.pdf")
if SAVE_FIGS:
    g3.savefig("EveningEMAratioParticip.pdf")

# %%
df_adherence = df_adherence.merge(
@ -386,5 +408,3 @@ lr_ols_evening_ratio = smf.ols(
)
ls_result_evening_ratio = lr_ols_evening_ratio.fit()
ls_result_evening_ratio.summary()

# %%
@ -16,7 +16,16 @@ class EsmFeatures(unittest.TestCase):

    def test_preprocess_esm(self):
        self.esm_processed = preprocess_esm(self.esm)
        # Check for columns which should have been extracted from esm_json.
        self.assertIn("question_id", self.esm_processed)
        self.assertIn("questionnaire_id", self.esm_processed)
        self.assertIn("esm_instructions", self.esm_processed)
        self.assertIn("esm_type", self.esm_processed)
        self.assertIn("time", self.esm_processed)
        # Check for explicitly added column.
        self.assertIn("datetime_lj", self.esm_processed)
        # All of these keys are referenced in other functions, so they are expected to be present in preprocessed ESM.
        # Since all of these are added in a single function, it should be OK to have many assert statements in one test.

    def test_classify_sessions_by_completion(self):
        self.esm_classified_sessions = classify_sessions_by_completion(