Compare commits

...

33 Commits

Author SHA1 Message Date
junos 577a874288 Add an example for linear regression. 2021-08-12 16:54:00 +02:00
junos c8bb481508 Add a parameter for grouping. 2021-08-12 15:07:20 +02:00
junos 98f1df81c6 Use the same function for ESM and other data. 2021-08-11 17:26:44 +02:00
junos ad85f79bc5 Move datetime calculation to a separate function. 2021-08-11 17:19:14 +02:00
junos 070cfdba80 Start machine learning pipeline example.
Select data and labels.
2021-08-11 16:42:30 +02:00
junos c6d0e4391e Add a couple of proximity features. 2021-08-11 16:40:19 +02:00
junos af65d0864f Add a simple function for recoding proximity. 2021-08-11 15:04:27 +02:00
junos a2180aee54 Fix assignment to use loc.
For assigning a value to selected rows (a subset), regular slicing using [] produces a KeyError.
2021-08-11 14:53:59 +02:00
junos a06ad0800f Explore missing application categories. 2021-08-09 16:02:23 +02:00
junos 06e1fe7410 Merge remote-tracking branch 'origin/communication' into communication 2021-08-06 18:53:57 +02:00
junos 02f2607be9 Fix formatting and typos. 2021-08-06 18:53:39 +02:00
junos cca5a29483 Rename features and add one for missed calls. 2021-08-06 18:53:39 +02:00
junos e3d735163f Add demonstrations of new functions. 2021-08-06 18:53:39 +02:00
Ivan Kobe 1b53865f0a deleted prototyping notebooks 2021-08-06 18:53:39 +02:00
Ivan Kobe 4ac5f37c19 additional communication features 2021-08-06 18:53:39 +02:00
junos 2fc80a34e7 Fix formatting and typos. 2021-08-06 18:53:18 +02:00
junos fbd9c2fc32 Rename features and add one for missed calls. 2021-08-06 18:51:13 +02:00
Junos Lukan d8899fa75b Merge branch 'communication' into 'master'
Communication

See merge request junoslukan/straw2analysis!1
2021-08-06 16:44:39 +00:00
Ivan Kobe 62af04fe09 Communication 2021-08-06 16:44:39 +00:00
junos 33ebf9caea Add demonstrations of new functions. 2021-08-06 18:38:21 +02:00
junos 40293c4752 Further reduce figure height and increase font size. 2021-08-04 18:05:52 +02:00
junos 9e87b1f176 Add an option to print figures and set font sizes. 2021-08-04 17:41:09 +02:00
Ivan Kobe 4a2ca581b3 deleted prototyping notebooks 2021-08-04 13:46:44 +02:00
Ivan Kobe d98b673824 additional communication features 2021-08-04 13:45:54 +02:00
junos 1bdb334c42 Fix formatting. 2021-07-27 20:57:21 +02:00
junos b99136a181 Document individual functions for event extraction. 2021-07-27 20:56:27 +02:00
junos 9bd42afa02 Simplify pivoting a table and fix other mistakes. 2021-07-27 20:41:13 +02:00
junos 0f5af21f71 Detect whether event was work related. 2021-07-27 19:53:44 +02:00
junos c4f7b6459d Extract event duration. 2021-07-27 19:47:54 +02:00
junos 19cddaa634 Convert event time to datetime. 2021-07-27 18:43:31 +02:00
junos 763b970a42 Detect stressful events and rename their stressfulness. 2021-07-27 18:30:05 +02:00
junos 3c12a6e74a Start extracting event features.
Calculate threat and challenge means.
2021-07-27 18:25:05 +02:00
junos 28e9db15f5 Add assertions for fields which are referenced in other functions. 2021-07-27 14:07:10 +02:00
13 changed files with 838 additions and 47 deletions

View File

@ -16,6 +16,7 @@ dependencies:
- python-dotenv
- pytz
- seaborn
- scikit-learn
- sqlalchemy
- statsmodels
- tabulate

View File

@ -0,0 +1,150 @@
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.11.4
# kernelspec:
# display_name: straw2analysis
# language: python
# name: straw2analysis
# ---
# %%
# %matplotlib inline
import datetime
import os
import sys
import seaborn as sns
from sklearn import linear_model
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
sys.path.append(nb_dir)
# %%
import participants.query_db
from features import esm, helper, proximity
# %% [markdown]
# # 1. Get the relevant data
# %%
participants_inactive_usernames = participants.query_db.get_usernames(
collection_start=datetime.date.fromisoformat("2020-08-01")
)
# Consider only two participants to simplify.
ptcp_2 = participants_inactive_usernames[0:2]
# %% [markdown]
# ## 1.1 Labels
# %%
df_esm = esm.get_esm_data(ptcp_2)
df_esm_preprocessed = esm.preprocess_esm(df_esm)
# %%
df_esm_PANAS = df_esm_preprocessed[
(df_esm_preprocessed["questionnaire_id"] == 8)
| (df_esm_preprocessed["questionnaire_id"] == 9)
]
df_esm_PANAS_clean = esm.clean_up_esm(df_esm_PANAS)
# %% [markdown]
# ## 1.2 Sensor data
# %%
df_proximity = proximity.get_proximity_data(ptcp_2)
df_proximity = helper.get_date_from_timestamp(df_proximity)
df_proximity = proximity.recode_proximity(df_proximity)
# %% [markdown]
# ## 1.3 Standardization/personalization
# %% [markdown]
# # 2. Grouping/segmentation
# %%
df_esm_PANAS_daily_means = (
df_esm_PANAS_clean.groupby(["participant_id", "date_lj", "questionnaire_id"])
.esm_user_answer_numeric.agg("mean")
.reset_index()
.rename(columns={"esm_user_answer_numeric": "esm_numeric_mean"})
)
# %%
df_esm_PANAS_daily_means = (
df_esm_PANAS_daily_means.pivot(
index=["participant_id", "date_lj"],
columns="questionnaire_id",
values="esm_numeric_mean",
)
.reset_index(col_level=1)
.rename(columns={8.0: "PA", 9.0: "NA"})
.set_index(["participant_id", "date_lj"])
)
# %%
df_proximity_daily_counts = proximity.count_proximity(
df_proximity, ["participant_id", "date_lj"]
)
# %%
df_proximity_daily_counts
# %% [markdown]
# # 3. Join features (and export to csv?)
# %%
df_full_data_daily_means = df_esm_PANAS_daily_means.join(
df_proximity_daily_counts
).reset_index()
# %% [markdown]
# # 4. Machine learning model and parameters
# %%
lin_reg_proximity = linear_model.LinearRegression()
# %% [markdown]
# ## 4.1 Validation method
# %%
logo = LeaveOneGroupOut()
logo.get_n_splits(
df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
df_full_data_daily_means["PA"],
groups=df_full_data_daily_means["participant_id"],
)
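# %% [markdown]
# An illustrative check (hypothetical cell, not from this changeset): with participant_id as the group,
# each LeaveOneGroupOut split should hold out all days of exactly one participant.
# %%
for train_index, test_index in logo.split(
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
    groups=df_full_data_daily_means["participant_id"],
):
    print(df_full_data_daily_means.iloc[test_index]["participant_id"].unique())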
# %% [markdown]
# ## 4.2 Fit results (export?)
# %%
cross_val_score(
lin_reg_proximity,
df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
df_full_data_daily_means["PA"],
groups=df_full_data_daily_means["participant_id"],
cv=logo,
n_jobs=-1,
scoring="r2",
)
# %%
lin_reg_proximity.fit(
df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
df_full_data_daily_means["PA"],
)
# %%
lin_reg_proximity.score(
df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
df_full_data_daily_means["PA"],
)
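# %%
# Illustrative follow-up (hypothetical cell, not from this changeset): coef_ and intercept_
# are standard attributes of a fitted LinearRegression and show how each proximity feature
# relates to the daily PA mean.
lin_reg_proximity.coef_, lin_reg_proximity.intercept_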

View File

@ -0,0 +1,76 @@
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.11.4
# kernelspec:
# display_name: straw2analysis
# language: python
# name: straw2analysis
# ---
# %%
# %matplotlib inline
import os
import sys
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
sys.path.append(nb_dir)
# %%
from config.models import AppCategories, Participant
from setup import db_engine, session
# %%
query_app_categories = session.query(AppCategories)
with db_engine.connect() as connection:
df_app_categories = pd.read_sql(query_app_categories.statement, connection)
# %%
df_app_categories.head()
# %%
df_app_categories["play_store_genre"].value_counts()
# %%
df_category_not_found = df_app_categories[
df_app_categories["play_store_genre"] == "not_found"
]
# %%
df_category_not_found["play_store_response"].value_counts()
# %%
df_category_not_found["package_name"].value_counts()
# %%
manufacturers = [
"samsung",
"oneplus",
"huawei",
"xiaomi",
"lge",
"motorola",
"miui",
"lenovo",
"oppo",
"mediatek",
]
custom_rom = ["coloros", "lineageos", "myos", "cyanogenmod", "foundation.e"]
other = ["android", "wssyncmldm"]
rows_os_manufacturer = df_category_not_found["package_name"].str.contains(
"|".join(manufacturers + custom_rom + other), case=False
)
# %%
with pd.option_context("display.max_rows", None, "display.max_columns", None):
display(df_category_not_found.loc[~rows_os_manufacturer])
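# %%
# Illustrative check (hypothetical cell, not from this changeset): the pattern above matches
# package names such as "com.samsung.android.messaging" or "com.miui.home",
# i.e. OS and manufacturer packages rather than user-installed apps.
rows_os_manufacturer.sum()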

View File

@ -6,7 +6,7 @@
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.11.2
# jupytext_version: 1.11.4
# kernelspec:
# display_name: straw2analysis
# language: python
@ -14,6 +14,7 @@
# ---
# %%
# %matplotlib inline
import os
import sys
@ -53,6 +54,15 @@ import participants.query_db
participants_inactive_usernames = participants.query_db.get_usernames()
df_calls_inactive = get_call_data(participants_inactive_usernames)
# %%
participants_inactive_usernames
# %%
df_calls_inactive.head()
# %%
enumerate_contacts(df_calls_inactive).head()
# %%
df_calls_features = count_comms(df_calls_inactive)
df_calls_features.head()
@ -70,6 +80,9 @@ calls_number = pd.wide_to_long(
suffix="\D+",
)
# %%
calls_number
# %%
sns.displot(calls_number, x="no", hue="call_type", binwidth=5, element="step", height=8)
@ -126,3 +139,30 @@ sms_number = pd.wide_to_long(
sns.displot(
sms_number, x="no", hue="message_type", binwidth=5, element="step", height=8
)
# %% [markdown]
# # Communication features
# %%
df_calls_enumerated = enumerate_contacts(df_calls)
display(df_calls_enumerated)
# %%
df_calls_contact_features = contact_features(df_calls_enumerated)
display(df_calls_contact_features)
# %%
df_sms_enumerated = enumerate_contacts(df_sms)
df_sms_contact_features = contact_features(df_sms_enumerated)
display(df_sms_contact_features)
# %%
display(count_comms(df_calls))
# %%
display(count_comms(df_sms))
# %%
display(calls_sms_features(df_calls, df_sms))
# %%

View File

@ -6,7 +6,7 @@
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.11.2
# jupytext_version: 1.11.4
# kernelspec:
# display_name: straw2analysis
# language: python
@ -14,6 +14,7 @@
# ---
# %%
# %matplotlib inline
import os
import sys

View File

@ -86,7 +86,8 @@ def enumerate_contacts(comm_df: pd.DataFrame) -> pd.DataFrame:
# In other words, recode the contacts into integers from 0 to n_contacts,
# so that the first one is contacted the most often.
contact_ids = (
contact_counts.groupby("participant_id") # Group again for enumeration.
# Group again for enumeration.
contact_counts.groupby("participant_id")
.cumcount() # Enumerate (count) rows *within* participants.
.to_frame("contact_id")
)
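# For example (illustrative): a participant who called contact X ten times and
# contact Y three times would get contact_id 0 for X and contact_id 1 for Y.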
@ -176,15 +177,148 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame:
return comm_features
def contact_features():
# TODO Implement a method that takes a DF with enumerated contacts as argument and calculates:
# * Duration of calls per caller (for most common callers)
# * Determine work vs non-work contacts by work hours heuristics
# * Number of people contacted
# And similarly for SMS.
pass
def contact_features(df_enumerated: pd.DataFrame) -> pd.DataFrame:
"""
Counts the number of people contacted (for each participant) and, if
df_enumerated is a dataframe containing calls data, the total duration
of calls between a participant and each of her contacts.
Parameters
----------
df_enumerated: pd.DataFrame
A dataframe of calls or SMSes; return of function enumerate_contacts.
Returns
-------
df_enumerated: pd.DataFrame
The altered dataframe with the column no_contacts and, if df_enumerated
contains calls data, an additional column total_call_duration.
"""
# Check whether df contains calls or SMS data since some
# features we want to calculate are type-specific.
if "call_duration" in df_enumerated:
# Add a column with the total duration of calls between two people
duration_count = (
df_enumerated.groupby(["participant_id", "contact_id"])
# For each participant and for each caller, sum durations of their calls
["call_duration"]
.sum()
.reset_index() # Make index (which is actually the participant id) a normal column
.rename(columns={"call_duration": "total_call_duration"})
)
# The new dataframe now contains columns containing information about
# participants, callers and the total duration of their calls. All that
# is now left to do is to merge the original df with the new one.
df_enumerated = df_enumerated.merge(
duration_count, on=["participant_id", "contact_id"]
)
contact_count = (
df_enumerated.groupby(["participant_id"])
.nunique()[
"contact_id"
] # For each participant, count the number of distinct contacts
.reset_index() # Make index (which is actually the participant id) a normal column
.rename(columns={"contact_id": "no_contacts"})
)
df_enumerated = (
# Merge df with the newly created df containing info about the number of contacts
df_enumerated.merge(contact_count, on="participant_id")
# Sort first by participant_id and then by contact_id and
# thereby restore the initial ordering of the input dataframe.
.sort_values(["participant_id", "contact_id"])
)
# TODO: Determine work vs non-work contacts by work hours heuristics
return df_enumerated
def calls_sms_features():
# TODO Relate the calls and sms data, such as comparing the number of (missed) calls and messages.
pass
def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataFrame:
"""
Calculates additional features relating calls and sms data.
Parameters
----------
df_calls: pd.DataFrame
A dataframe of calls (return of get_call_data).
df_sms: pd.DataFrame
A dataframe of SMSes (return of get_sms_data).
Returns
-------
df_calls_sms: pd.DataFrame
A dataframe of features relating calls and SMS data, with one row per participant.
These are:
* proportion_calls:
proportion of calls in total number of communications
* proportion_calls_incoming:
proportion of incoming calls in total number of incoming/received communications
* proportion_calls_outgoing:
proportion of outgoing calls in total number of outgoing/sent communications
* proportion_calls_missed_sms_received:
proportion of missed calls in the total of missed calls and received messages
* proportion_calls_contacts:
proportion of calls contacts in total number of communication contacts
"""
count_calls = count_comms(df_calls)
count_sms = count_comms(df_sms)
count_joined = (
count_calls.merge(
count_sms, on="participant_id", suffixes=("_calls", "_sms")
) # Merge calls and sms features
.reset_index() # Make participant_id a regular column
.assign(
proportion_calls=(
lambda x: x.no_all_calls / (x.no_all_calls + x.no_all_sms)
),
proportion_calls_incoming=(
lambda x: x.no_incoming / (x.no_incoming + x.no_received)
),
proportion_calls_missed_sms_received=(
lambda x: x.no_missed / (x.no_missed + x.no_received)
),
proportion_calls_outgoing=(
lambda x: x.no_outgoing / (x.no_outgoing + x.no_sent)
)
# Calculate new features and create additional columns
)[
[
"participant_id",
"proportion_calls",
"proportion_calls_incoming",
"proportion_calls_outgoing",
"proportion_calls_missed_sms_received",
]
] # Filter out only the relevant features
)
features_calls = contact_features(enumerate_contacts(df_calls))
features_sms = contact_features(enumerate_contacts(df_sms))
features_joined = (
features_calls.merge(
features_sms, on="participant_id", suffixes=("_calls", "_sms")
) # Merge calls and sms features
.reset_index() # Make participant_id a regular column
.assign(
proportion_calls_contacts=(
lambda x: x.no_contacts_calls
/ (x.no_contacts_calls + x.no_contacts_sms)
) # Calculate new features and create additional columns
)[
["participant_id", "proportion_calls_contacts"]
] # Filter out only the relevant features
# Since we are only interested in some features and ignore the others,
# a lot of duplicate rows were created. Remove them.
.drop_duplicates()
)
# Join the newly created dataframes
df_calls_sms = count_joined.merge(features_joined, on="participant_id")
return df_calls_sms
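# Worked example (illustrative): a participant with 10 calls and 30 SMSes in total
# gets proportion_calls = 10 / (10 + 30) = 0.25; with 2 missed calls and 8 received
# messages, proportion_calls_missed_sms_received = 2 / (2 + 8) = 0.2.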

View File

@ -1,14 +1,12 @@
import datetime
from collections.abc import Collection
import numpy as np
import pandas as pd
from pytz import timezone
from config.models import ESM, Participant
from features import helper
from setup import db_engine, session
TZ_LJ = timezone("Europe/Ljubljana")
ESM_STATUS_ANSWERED = 2
GROUP_SESSIONS_BY = ["participant_id", "device_id", "esm_session"]
@ -67,14 +65,8 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
df_esm_preprocessed: pd.DataFrame
A dataframe with added columns: datetime in Ljubljana timezone and all fields from ESM_JSON column.
"""
df_esm["datetime_lj"] = df_esm["double_esm_user_answer_timestamp"].apply(
lambda x: datetime.datetime.fromtimestamp(x / 1000.0, tz=TZ_LJ)
)
df_esm = df_esm.assign(
date_lj=lambda x: (x.datetime_lj - datetime.timedelta(hours=4)).dt.date
)
# Since daytime EMAs could *theoretically* last beyond midnight, but never after 4 AM,
# the datetime is first translated to 4 h earlier.
df_esm = helper.get_date_from_timestamp(df_esm)
df_esm_json = pd.json_normalize(df_esm["esm_json"]).drop(
columns=["esm_trigger"]
) # The esm_trigger column is already present in the main df.
@ -256,9 +248,9 @@ def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
ESM.ESM_TYPE.get("scale"),
ESM.ESM_TYPE.get("number"),
]
df_esm_clean[df_esm_clean["esm_type"].isin(esm_type_numeric)] = df_esm_clean[
df_esm_clean.loc[
df_esm_clean["esm_type"].isin(esm_type_numeric)
].assign(
] = df_esm_clean.loc[df_esm_clean["esm_type"].isin(esm_type_numeric)].assign(
esm_user_answer_numeric=lambda x: x.esm_user_answer.str.slice(stop=1).astype(
int
)

267
features/esm_SAM.py 100644
View File

@ -0,0 +1,267 @@
import numpy as np
import pandas as pd
import features.esm
QUESTIONNAIRE_ID_SAM = {
"event_stress": 87,
"event_threat": 88,
"event_challenge": 89,
"event_time": 90,
"event_duration": 91,
"event_work_related": 92,
"period_stress": 93,
}
QUESTIONNAIRE_ID_SAM_LOW = min(QUESTIONNAIRE_ID_SAM.values())
QUESTIONNAIRE_ID_SAM_HIGH = max(QUESTIONNAIRE_ID_SAM.values())
GROUP_QUESTIONNAIRES_BY = [
"participant_id",
"device_id",
"esm_session",
]
# Each questionnaire occurs only once within each esm_session on the same device within the same participant.
def extract_stressful_events(df_esm: pd.DataFrame) -> pd.DataFrame:
# 0. Select only questions from Stress Appraisal Measure.
df_esm_preprocessed = features.esm.preprocess_esm(df_esm)
df_esm_sam = df_esm_preprocessed[
(df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_ID_SAM_LOW)
& (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_ID_SAM_HIGH)
]
df_esm_sam_clean = features.esm.clean_up_esm(df_esm_sam)
# 1.
df_esm_event_threat_challenge_mean_wide = calculate_threat_challenge_means(
df_esm_sam_clean
)
# 2.
df_esm_event_stress = detect_stressful_event(df_esm_sam_clean)
# Join to the previously calculated features related to the events.
df_esm_events = df_esm_event_threat_challenge_mean_wide.join(
df_esm_event_stress[
GROUP_QUESTIONNAIRES_BY + ["event_present", "event_stressfulness"]
].set_index(GROUP_QUESTIONNAIRES_BY)
)
# 3.
df_esm_event_work_related = detect_event_work_related(df_esm_sam_clean)
df_esm_events = df_esm_events.join(
df_esm_event_work_related[
GROUP_QUESTIONNAIRES_BY + ["event_work_related"]
].set_index(GROUP_QUESTIONNAIRES_BY)
)
# 4.
df_esm_event_time = convert_event_time(df_esm_sam_clean)
df_esm_events = df_esm_events.join(
df_esm_event_time[GROUP_QUESTIONNAIRES_BY + ["event_time"]].set_index(
GROUP_QUESTIONNAIRES_BY
)
)
# 5.
df_esm_event_duration = extract_event_duration(df_esm_sam_clean)
df_esm_events = df_esm_events.join(
df_esm_event_duration[
GROUP_QUESTIONNAIRES_BY + ["event_duration", "event_duration_info"]
].set_index(GROUP_QUESTIONNAIRES_BY)
)
return df_esm_events
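# Illustrative usage (hypothetical): df_esm_events = extract_stressful_events(df_esm)
# returns one row per (participant_id, device_id, esm_session) with threat and challenge
# means, event_present, event_stressfulness, event_work_related, event_time,
# event_duration and event_duration_info.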
def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
"""
This function calculates challenge and threat means (two Stress Appraisal Measure subscales)
for each ESM session (within participants and devices).
It creates a grouped dataframe with means in two columns.
Parameters
----------
df_esm_sam_clean: pd.DataFrame
A cleaned up dataframe of Stress Appraisal Measure items.
Returns
-------
df_esm_event_threat_challenge_mean_wide: pd.DataFrame
A dataframe of unique ESM sessions (by participants and devices) with threat and challenge means.
"""
# Select only threat and challenge assessments for events
df_esm_event_threat_challenge = df_esm_sam_clean[
(
df_esm_sam_clean["questionnaire_id"]
== QUESTIONNAIRE_ID_SAM.get("event_threat")
)
| (
df_esm_sam_clean["questionnaire_id"]
== QUESTIONNAIRE_ID_SAM.get("event_challenge")
)
]
# Calculate mean of threat and challenge subscales for each ESM session.
df_esm_event_threat_challenge_mean_wide = pd.pivot_table(
df_esm_event_threat_challenge,
index=["participant_id", "device_id", "esm_session"],
columns=["questionnaire_id"],
values=["esm_user_answer_numeric"],
aggfunc="mean",
)
# Drop unnecessary column values.
df_esm_event_threat_challenge_mean_wide.columns = df_esm_event_threat_challenge_mean_wide.columns.get_level_values(
1
)
df_esm_event_threat_challenge_mean_wide.columns.name = None
df_esm_event_threat_challenge_mean_wide.rename(
columns={
QUESTIONNAIRE_ID_SAM.get("event_threat"): "threat_mean",
QUESTIONNAIRE_ID_SAM.get("event_challenge"): "challenge_mean",
},
inplace=True,
)
return df_esm_event_threat_challenge_mean_wide
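# For example (illustrative): a session with threat answers 2 and 4 and challenge
# answers 1 and 3 yields threat_mean = 3.0 and challenge_mean = 2.0.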
def detect_stressful_event(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
"""
Participants were asked: "Was there a particular event that created tension in you?"
The following options were available:
0 - No,
1 - Yes, slightly,
2 - Yes, moderately,
3 - Yes, considerably,
4 - Yes, extremely.
This function indicates whether there was a stressful event (True/False)
and how stressful it was on a scale of 1 to 4.
Parameters
----------
df_esm_sam_clean: pd.DataFrame
A cleaned up dataframe of Stress Appraisal Measure items.
Returns
-------
df_esm_event_stress: pd.DataFrame
The same dataframe with two new columns:
- event_present, indicating whether there was a stressful event at all,
- event_stressfulness, a numeric answer (1-4) to the single item question.
"""
df_esm_event_stress = df_esm_sam_clean[
df_esm_sam_clean["questionnaire_id"] == QUESTIONNAIRE_ID_SAM.get("event_stress")
]
df_esm_event_stress = df_esm_event_stress.assign(
event_present=lambda x: x.esm_user_answer_numeric > 0,
event_stressfulness=lambda x: x.esm_user_answer_numeric,
)
return df_esm_event_stress
def detect_event_work_related(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
"""
This function simply adds a column indicating the answer to the question:
"Was/is this event work-related?"
Parameters
----------
df_esm_sam_clean: pd.DataFrame
A cleaned up dataframe of Stress Appraisal Measure items.
Returns
-------
df_esm_event_stress: pd.DataFrame
The same dataframe with a new column event_work_related (True/False).
"""
df_esm_event_stress = df_esm_sam_clean[
df_esm_sam_clean["questionnaire_id"]
== QUESTIONNAIRE_ID_SAM.get("event_work_related")
]
df_esm_event_stress = df_esm_event_stress.assign(
event_work_related=lambda x: x.esm_user_answer_numeric > 0
)
return df_esm_event_stress
def convert_event_time(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
"""
This function only serves to convert the string datetime answer into a real datetime type.
Errors during this conversion are coerced, meaning that non-datetime answers are assigned Not a Time (NaT).
NOTE: Since the only available non-datetime answer to this question was "0 - I do not remember",
the NaTs can be interpreted to mean this.
Parameters
----------
df_esm_sam_clean: pd.DataFrame
A cleaned up dataframe of Stress Appraisal Measure items.
Returns
-------
df_esm_event_time: pd.DataFrame
The same dataframe with a new column event_time of datetime type.
"""
df_esm_event_time = df_esm_sam_clean[
df_esm_sam_clean["questionnaire_id"] == QUESTIONNAIRE_ID_SAM.get("event_time")
].assign(
event_time=lambda x: pd.to_datetime(
x.esm_user_answer, errors="coerce", infer_datetime_format=True, exact=True
)
)
return df_esm_event_time
def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
"""
If participants indicated a stressful event, they were asked:
"How long did this event last? (Answer in hours and minutes)"
This function extracts this duration time and saves additional answers:
0 - I do not remember,
1 - It is still going on.
Parameters
----------
df_esm_sam_clean: pd.DataFrame
A cleaned up dataframe of Stress Appraisal Measure items.
Returns
-------
df_esm_event_duration: pd.DataFrame
The same dataframe with two new columns:
- event_duration, a time part of a datetime,
- event_duration_info, recording the other possible answers to this question:
0 - I do not remember,
1 - It is still going on
"""
df_esm_event_duration = df_esm_sam_clean[
df_esm_sam_clean["questionnaire_id"]
== QUESTIONNAIRE_ID_SAM.get("event_duration")
].assign(
event_duration=lambda x: pd.to_datetime(
x.esm_user_answer.str.slice(start=0, stop=-6), errors="coerce"
).dt.time
)
# TODO Explore the values recorded in event_duration and possibly fix mistakes.
# For example, participants reported setting 23:50:00 instead of 00:50:00.
# For the events for which no duration was found (i.e. event_duration = NaT),
# we can determine whether:
# - this event is still going on ("1 - It is still going on")
# - the participant couldn't remember its duration ("0 - I do not remember")
# Generally, these answers were converted to esm_user_answer_numeric in clean_up_esm,
# but only for the numeric types of questions and answers.
# Since this question was of the "datetime" type, convert these specific answers here again.
df_esm_event_duration["event_duration_info"] = np.nan
df_esm_event_duration[
df_esm_event_duration.event_duration.isna()
] = df_esm_event_duration[df_esm_event_duration.event_duration.isna()].assign(
event_duration_info=lambda x: x.esm_user_answer.str.slice(stop=1).astype(int)
)
return df_esm_event_duration
# TODO: How many questions about the stressfulness of the period were asked and how does this relate to events?

41
features/helper.py 100644
View File

@ -0,0 +1,41 @@
import datetime
import pandas as pd
from pytz import timezone
TZ_LJ = timezone("Europe/Ljubljana")
COLUMN_TIMESTAMP = "timestamp"
COLUMN_TIMESTAMP_ESM = "double_esm_user_answer_timestamp"
def get_date_from_timestamp(df_aware) -> pd.DataFrame:
"""
Transform a UNIX timestamp into a datetime (with Ljubljana timezone).
Additionally, extract only the date part, where anything until 4 AM is considered the same day.
Parameters
----------
df_aware: pd.DataFrame
Any AWARE-type data as defined in models.py.
Returns
-------
df_aware: pd.DataFrame
The same dataframe with datetime_lj and date_lj columns added.
"""
if COLUMN_TIMESTAMP_ESM in df_aware:
column_timestamp = COLUMN_TIMESTAMP_ESM
else:
column_timestamp = COLUMN_TIMESTAMP
df_aware["datetime_lj"] = df_aware[column_timestamp].apply(
lambda x: datetime.datetime.fromtimestamp(x / 1000.0, tz=TZ_LJ)
)
df_aware = df_aware.assign(
date_lj=lambda x: (x.datetime_lj - datetime.timedelta(hours=4)).dt.date
)
# Since daytime EMAs could *theoretically* last beyond midnight, but never after 4 AM,
# the datetime is first translated to 4 h earlier.
return df_aware
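# Worked example (illustrative): a timestamp corresponding to 2021-08-12 01:30 in Ljubljana
# is shifted back by 4 hours to 2021-08-11 21:30, so date_lj becomes 2021-08-11,
# i.e. an EMA answered shortly after midnight still counts towards the previous day.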

View File

@ -28,3 +28,63 @@ def get_proximity_data(usernames: Collection) -> pd.DataFrame:
with db_engine.connect() as connection:
df_proximity = pd.read_sql(query_proximity.statement, connection)
return df_proximity
def recode_proximity(df_proximity: pd.DataFrame) -> pd.DataFrame:
"""
This function recodes proximity from a double to a boolean value.
Different proximity sensors report different values,
but only a few distinct values have ever been found in our data.
These are therefore converted into "near" and "far" binary values.
See expl_proximity.ipynb for additional info.
Parameters
----------
df_proximity: pd.DataFrame
A dataframe of proximity data.
Returns
-------
df_proximity: pd.DataFrame
The same dataframe with an additional column bool_prox_near,
indicating whether "near" proximity was reported.
False values correspond to "far" reported by this sensor.
"""
df_proximity = df_proximity.assign(bool_prox_near=lambda x: x.double_proximity == 0)
return df_proximity
def count_proximity(
df_proximity: pd.DataFrame, group_by: Collection = ["participant_id"]
) -> pd.DataFrame:
"""
This function counts how many times a "near" value occurs in the proximity data
and calculates the proportion of these counts to all proximity values (i.e. the relative count).
Parameters
----------
df_proximity: pd.DataFrame
A dataframe of proximity data.
group_by: Collection
A list of strings, specifying by which parameters to group.
By default, the features are calculated per participant, but columns such as "date_lj" can be added to group by date as well.
Returns
-------
df_proximity_features: pd.DataFrame
A dataframe with the count of "near" proximity values and their relative count.
"""
if "bool_prox_near" not in df_proximity:
df_proximity = recode_proximity(df_proximity)
df_proximity["bool_prox_far"] = ~df_proximity["bool_prox_near"]
df_proximity_features = df_proximity.groupby(group_by).sum()[
["bool_prox_near", "bool_prox_far"]
]
df_proximity_features = df_proximity_features.assign(
prop_prox_near=lambda x: x.bool_prox_near / (x.bool_prox_near + x.bool_prox_far)
)
df_proximity_features = df_proximity_features.rename(
columns={"bool_prox_near": "freq_prox_near"}
).drop(columns="bool_prox_far", inplace=False)
return df_proximity_features
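# Worked example (illustrative): a participant with three "near" and one "far" reading
# in a group gets freq_prox_near = 3 and prop_prox_near = 3 / (3 + 1) = 0.75.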

View File

View File

@ -6,7 +6,7 @@
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.11.2
# jupytext_version: 1.11.4
# kernelspec:
# display_name: straw2analysis
# language: python
@ -14,12 +14,12 @@
# ---
# %%
# %matplotlib inline
import datetime
# %%
import os
import sys
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
@ -31,6 +31,24 @@ if nb_dir not in sys.path:
import participants.query_db
from features.esm import *
# %%
SAVE_FIGS = True
FIG_HEIGHT = 5
FIG_ASPECT = 1.7
FIG_COLOUR = "#28827C"
SMALL_SIZE = 14
MEDIUM_SIZE = SMALL_SIZE + 2
BIGGER_SIZE = MEDIUM_SIZE + 2
plt.rc("font", size=SMALL_SIZE) # controls default text sizes
plt.rc("axes", titlesize=SMALL_SIZE) # fontsize of the axes title
plt.rc("axes", labelsize=MEDIUM_SIZE) # fontsize of the x and y labels
plt.rc("xtick", labelsize=SMALL_SIZE) # fontsize of the tick labels
plt.rc("ytick", labelsize=SMALL_SIZE) # fontsize of the tick labels
plt.rc("legend", fontsize=SMALL_SIZE) # legend fontsize
plt.rc("figure", titlesize=BIGGER_SIZE) # fontsize of the figure title
# %%
baseline_si = pd.read_csv("E:/STRAWbaseline/results-survey637813.csv")
baseline_be_1 = pd.read_csv("E:/STRAWbaseline/results-survey358134.csv")
@ -130,7 +148,7 @@ df_adherence.describe()
df_adherence[["gender", "startlanguage"]].value_counts()
# %%
sns.displot(df_adherence["finished_sessions"], binwidth=5, height=5)
sns.displot(df_adherence["finished_sessions"], binwidth=5, height=FIG_HEIGHT)
# %%
lm_adherence = smf.ols(
@ -224,12 +242,14 @@ df_session_workday = df_session_workday.assign(
g1 = sns.displot(
df_session_workday["time_diff_minutes"],
binwidth=5,
height=5,
aspect=1.5,
color="#28827C",
height=FIG_HEIGHT,
aspect=FIG_ASPECT,
color=FIG_COLOUR,
)
g1.set_axis_labels("Time difference [min]", "Session count")
# g1.savefig("WorkdayEMAtimeDiff.pdf")
g1.set(xlim=(0, 570))
if SAVE_FIGS:
g1.savefig("WorkdayEMAtimeDiff.pdf")
# %% [markdown]
# There are some sessions that are really close together. By design, none should be closer than 30 min. Let's take a look at those.
@ -296,12 +316,13 @@ df_mean_daytime_interval.describe()
g2 = sns.displot(
df_mean_daytime_interval.time_diff_minutes,
binwidth=5,
height=5,
aspect=1.5,
color="#28827C",
height=FIG_HEIGHT,
aspect=FIG_ASPECT,
color=FIG_COLOUR,
)
g2.set_axis_labels("Median time difference [min]", "Participant count")
# g2.savefig("WorkdayEMAtimeDiffMedianParticip.pdf")
if SAVE_FIGS:
g2.savefig("WorkdayEMAtimeDiffMedianParticip.pdf")
# %%
df_adherence = df_adherence.merge(
@ -327,9 +348,9 @@ df_count_daytime_per_participant["time"].describe()
sns.displot(
df_count_daytime_per_participant.time,
binwidth=1,
height=5,
aspect=1.5,
color="#28827C",
height=FIG_HEIGHT,
aspect=FIG_ASPECT,
color=FIG_COLOUR,
)
# %% [markdown]
@ -364,13 +385,14 @@ s_evening_completed_ratio.describe()
g3 = sns.displot(
s_evening_completed_ratio - 0.001,
binwidth=0.05,
height=5,
aspect=1.5,
color="#28827C",
height=FIG_HEIGHT,
aspect=FIG_ASPECT,
color=FIG_COLOUR,
)
g3.set_axis_labels("Ratio of days with the evening EMA filled out", "Participant count")
g3.set(xlim=(1.01, 0.59))
# g3.savefig("EveningEMAratioParticip.pdf")
if SAVE_FIGS:
g3.savefig("EveningEMAratioParticip.pdf")
# %%
df_adherence = df_adherence.merge(
@ -386,5 +408,3 @@ lr_ols_evening_ratio = smf.ols(
)
ls_result_evening_ratio = lr_ols_evening_ratio.fit()
ls_result_evening_ratio.summary()
# %%

View File

@ -16,7 +16,16 @@ class EsmFeatures(unittest.TestCase):
def test_preprocess_esm(self):
self.esm_processed = preprocess_esm(self.esm)
# Check for columns which should have been extracted from esm_json.
self.assertIn("question_id", self.esm_processed)
self.assertIn("questionnaire_id", self.esm_processed)
self.assertIn("esm_instructions", self.esm_processed)
self.assertIn("esm_type", self.esm_processed)
self.assertIn("time", self.esm_processed)
# Check for explicitly added column.
self.assertIn("datetime_lj", self.esm_processed)
# All of these keys are referenced in other functions, so they are expected to be present in preprocessed ESM.
# Since all of these are added in a single function, it should be OK to have many assert statements in one test.
def test_classify_sessions_by_completion(self):
self.esm_classified_sessions = classify_sessions_by_completion(