Compare commits


No commits in common. "577a874288e84f5b57c4ae79b89712c67b1285ef" and "1aaf95fe9e3ca9d9bc578d847f1914329f5a27f8" have entirely different histories.

13 changed files with 47 additions and 838 deletions

View File

@@ -16,7 +16,6 @@ dependencies:
   - python-dotenv
   - pytz
   - seaborn
-  - scikit-learn
   - sqlalchemy
   - statsmodels
   - tabulate

View File

@@ -1,150 +0,0 @@
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.11.4
# kernelspec:
# display_name: straw2analysis
# language: python
# name: straw2analysis
# ---
# %%
# %matplotlib inline
import datetime
import os
import sys
import seaborn as sns
from sklearn import linear_model
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
sys.path.append(nb_dir)
# %%
import participants.query_db
from features import esm, helper, proximity
# %% [markdown]
# # 1. Get the relevant data
# %%
participants_inactive_usernames = participants.query_db.get_usernames(
collection_start=datetime.date.fromisoformat("2020-08-01")
)
# Consider only two participants to simplify.
ptcp_2 = participants_inactive_usernames[0:2]
# %% [markdown]
# ## 1.1 Labels
# %%
df_esm = esm.get_esm_data(ptcp_2)
df_esm_preprocessed = esm.preprocess_esm(df_esm)
# %%
df_esm_PANAS = df_esm_preprocessed[
(df_esm_preprocessed["questionnaire_id"] == 8)
| (df_esm_preprocessed["questionnaire_id"] == 9)
]
df_esm_PANAS_clean = esm.clean_up_esm(df_esm_PANAS)
# %% [markdown]
# ## 1.2 Sensor data
# %%
df_proximity = proximity.get_proximity_data(ptcp_2)
df_proximity = helper.get_date_from_timestamp(df_proximity)
df_proximity = proximity.recode_proximity(df_proximity)
# %% [markdown]
# ## 1.3 Standardization/personalization
# %% [markdown]
# # 2. Grouping/segmentation
# %%
df_esm_PANAS_daily_means = (
df_esm_PANAS_clean.groupby(["participant_id", "date_lj", "questionnaire_id"])
.esm_user_answer_numeric.agg("mean")
.reset_index()
.rename(columns={"esm_user_answer_numeric": "esm_numeric_mean"})
)
# %%
df_esm_PANAS_daily_means = (
df_esm_PANAS_daily_means.pivot(
index=["participant_id", "date_lj"],
columns="questionnaire_id",
values="esm_numeric_mean",
)
.reset_index(col_level=1)
.rename(columns={8.0: "PA", 9.0: "NA"})
.set_index(["participant_id", "date_lj"])
)
# %%
df_proximity_daily_counts = proximity.count_proximity(
df_proximity, ["participant_id", "date_lj"]
)
# %%
df_proximity_daily_counts
# %% [markdown]
# # 3. Join features (and export to csv?)
# %%
df_full_data_daily_means = df_esm_PANAS_daily_means.join(
df_proximity_daily_counts
).reset_index()
# %% [markdown]
# # 4. Machine learning model and parameters
# %%
lin_reg_proximity = linear_model.LinearRegression()
# %% [markdown]
# ## 4.1 Validation method
# %%
logo = LeaveOneGroupOut()
logo.get_n_splits(
df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
df_full_data_daily_means["PA"],
groups=df_full_data_daily_means["participant_id"],
)
# %% [markdown]
# ## 4.2 Fit results (export?)
# %%
cross_val_score(
lin_reg_proximity,
df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
df_full_data_daily_means["PA"],
groups=df_full_data_daily_means["participant_id"],
cv=logo,
n_jobs=-1,
scoring="r2",
)
# %%
lin_reg_proximity.fit(
df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
df_full_data_daily_means["PA"],
)
# %%
lin_reg_proximity.score(
df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
df_full_data_daily_means["PA"],
)
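The notebook above fits a linear regression with leave-one-participant-out validation. A minimal, self-contained sketch of the same pattern on synthetic data (the feature and target column names are reused from the notebook; the values themselves are made up):
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score

# Synthetic stand-in for df_full_data_daily_means: two proximity features
# and a PA (positive affect) target, with rows grouped by participant.
rng = np.random.default_rng(0)
df_toy = pd.DataFrame(
    {
        "participant_id": np.repeat([1, 2, 3], 10),
        "freq_prox_near": rng.integers(0, 50, 30),
        "prop_prox_near": rng.random(30),
    }
)
df_toy["PA"] = 3 + 0.01 * df_toy["freq_prox_near"] + rng.normal(0, 0.2, 30)

# LeaveOneGroupOut holds out all rows of one participant per fold,
# so the score reflects generalisation to an unseen participant.
logo = LeaveOneGroupOut()
scores = cross_val_score(
    linear_model.LinearRegression(),
    df_toy[["freq_prox_near", "prop_prox_near"]],
    df_toy["PA"],
    groups=df_toy["participant_id"],
    cv=logo,
    scoring="r2",
)
print(scores)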

View File

@@ -1,76 +0,0 @@
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.11.4
# kernelspec:
# display_name: straw2analysis
# language: python
# name: straw2analysis
# ---
# %%
# %matplotlib inline
import os
import sys
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
sys.path.append(nb_dir)
# %%
from config.models import AppCategories, Participant
from setup import db_engine, session
# %%
query_app_categories = session.query(AppCategories)
with db_engine.connect() as connection:
df_app_categories = pd.read_sql(query_app_categories.statement, connection)
# %%
df_app_categories.head()
# %%
df_app_categories["play_store_genre"].value_counts()
# %%
df_category_not_found = df_app_categories[
df_app_categories["play_store_genre"] == "not_found"
]
# %%
df_category_not_found["play_store_response"].value_counts()
# %%
df_category_not_found["package_name"].value_counts()
# %%
manufacturers = [
"samsung",
"oneplus",
"huawei",
"xiaomi",
"lge",
"motorola",
"miui",
"lenovo",
"oppo",
"mediatek",
]
custom_rom = ["coloros", "lineageos", "myos", "cyanogenmod", "foundation.e"]
other = ["android", "wssyncmldm"]
rows_os_manufacturer = df_category_not_found["package_name"].str.contains(
"|".join(manufacturers + custom_rom + other), case=False
)
# %%
with pd.option_context("display.max_rows", None, "display.max_columns", None):
display(df_category_not_found.loc[~rows_os_manufacturer])

View File

@@ -6,7 +6,7 @@
 # extension: .py
 # format_name: percent
 # format_version: '1.3'
-# jupytext_version: 1.11.4
+# jupytext_version: 1.11.2
 # kernelspec:
 # display_name: straw2analysis
 # language: python
@@ -14,7 +14,6 @@
 # ---
 # %%
-# %matplotlib inline
 import os
 import sys
@@ -54,15 +53,6 @@ import participants.query_db
 participants_inactive_usernames = participants.query_db.get_usernames()
 df_calls_inactive = get_call_data(participants_inactive_usernames)
-# %%
-participants_inactive_usernames
-# %%
-df_calls_inactive.head()
-# %%
-enumerate_contacts(df_calls_inactive).head()
 # %%
 df_calls_features = count_comms(df_calls_inactive)
 df_calls_features.head()
@@ -80,9 +70,6 @@ calls_number = pd.wide_to_long(
 suffix="\D+",
 )
-# %%
-calls_number
 # %%
 sns.displot(calls_number, x="no", hue="call_type", binwidth=5, element="step", height=8)
@@ -139,30 +126,3 @@ sms_number = pd.wide_to_long(
 sns.displot(
 sms_number, x="no", hue="message_type", binwidth=5, element="step", height=8
 )
-# %% [markdown]
-# # Communication features
-# %%
-df_calls_enumerated = enumerate_contacts(df_calls)
-display(df_calls_enumerated)
-# %%
-df_calls_contact_features = contact_features(df_calls_enumerated)
-display(df_calls_contact_features)
-# %%
-df_sms_enumerated = enumerate_contacts(df_sms)
-df_sms_contact_features = contact_features(df_sms_enumerated)
-display(df_sms_contact_features)
-# %%
-display(count_comms(df_calls))
-# %%
-display(count_comms(df_sms))
-# %%
-display(calls_sms_features(df_calls, df_sms))
-# %%
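The pd.wide_to_long calls whose closing lines appear in the hunks above reshape per-participant communication counts into a long format with one row per call or message type, which is what sns.displot(..., x="no", hue="call_type") expects. A small sketch of that reshaping on made-up data; the count column names (no_incoming, no_outgoing, no_missed) follow count_comms, while sep="_" is an assumption:
import pandas as pd

# Assumed shape of the count_comms output for calls: one row per participant,
# one column per call type (hypothetical values).
df_calls_features = pd.DataFrame(
    {
        "participant_id": [1, 2],
        "no_incoming": [10, 4],
        "no_outgoing": [7, 9],
        "no_missed": [2, 1],
    }
)
# Reshape to long format: one row per (participant, call_type) with the count in "no".
calls_number = pd.wide_to_long(
    df_calls_features,
    stubnames="no",
    i="participant_id",
    j="call_type",
    sep="_",
    suffix=r"\D+",
).reset_index()
print(calls_number)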

View File

@@ -6,7 +6,7 @@
 # extension: .py
 # format_name: percent
 # format_version: '1.3'
-# jupytext_version: 1.11.4
+# jupytext_version: 1.11.2
 # kernelspec:
 # display_name: straw2analysis
 # language: python
@@ -14,7 +14,6 @@
 # ---
 # %%
-# %matplotlib inline
 import os
 import sys

View File

@@ -86,8 +86,7 @@ def enumerate_contacts(comm_df: pd.DataFrame) -> pd.DataFrame:
 # In other words, recode the contacts into integers from 0 to n_contacts,
 # so that the first one is contacted the most often.
 contact_ids = (
-# Group again for enumeration.
-contact_counts.groupby("participant_id")
+contact_counts.groupby("participant_id")  # Group again for enumeration.
 .cumcount()  # Enumerate (count) rows *within* participants.
 .to_frame("contact_id")
 )
@@ -177,148 +176,15 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame:
 return comm_features
+def contact_features():
+# TODO Implement a method that takes a DF with enumerated contacts as argument and calculates:
+# * Duration of calls per caller (for most common callers)
+# * Determine work vs non-work contacts by work hours heuristics
+# * Number of people contacted
+# And similarly for SMS.
+pass
-def contact_features(df_enumerated: pd.DataFrame) -> pd.DataFrame:
-"""
-Counts the number of people contacted (for each participant) and, if
-df_enumerated is a dataframe containing calls data, the total duration
-of calls between a participant and each of her contacts.
-Parameters
----------
df_enumerated: pd.DataFrame
A dataframe of calls or SMSes; return of function enumerate_contacts.
Returns
-------
comm_df: pd.DataFrame
The altered dataframe with the column no_contacts and, if df_enumerated
contains calls data, an additional column total_call_duration.
"""
# Check whether df contains calls or SMS data since some
# features we want to calculate are type-specific
if "call_duration" in df_enumerated:
# Add a column with the total duration of calls between two people
duration_count = (
df_enumerated.groupby(["participant_id", "contact_id"])
# For each participant and for each caller, sum durations of their calls
["call_duration"]
.sum()
.reset_index() # Make index (which is actually the participant id) a normal column
.rename(columns={"call_duration": "total_call_duration"})
)
# The new dataframe now contains columns containing information about
# participants, callers and the total duration of their calls. All that
# is now left to do is to merge the original df with the new one.
df_enumerated = df_enumerated.merge(
duration_count, on=["participant_id", "contact_id"]
)
contact_count = (
df_enumerated.groupby(["participant_id"])
.nunique()[
"contact_id"
] # For each participant, count the number of distinct contacts
.reset_index() # Make index (which is actually the participant id) a normal column
.rename(columns={"contact_id": "no_contacts"})
)
df_enumerated = (
# Merge df with the newly created df containing info about number of contacts
df_enumerated.merge(contact_count, on="participant_id")
# Sort first by participant_id and then by contact_id and
# thereby restore the initial ordering of input dataframes.
.sort_values(["participant_id", "contact_id"])
)
# TODO:Determine work vs non-work contacts by work hours heuristics
return df_enumerated
+def calls_sms_features():
+# TODO Relate the calls and sms data, such as comparing the number of (missed) calls and messages.
+pass
-def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataFrame:
-"""
-Calculates additional features relating calls and sms data.
Parameters
----------
df_calls: pd.DataFrame
A dataframe of calls (return of get_call_data).
df_sms: pd.DataFrame
A dataframe of SMSes (return of get_sms_data).
Returns
-------
df_calls_sms: pd.DataFrame
The list of features relating calls and sms data for every participant.
These are:
* proportion_calls:
proportion of calls in total number of communications
* proportion_calls_incoming:
proportion of incoming calls in total number of incoming/received communications
* proportion_calls_outgoing:
proportion of outgoing calls in total number of outgoing/sent communications
* proportion_calls_missed_sms_received:
proportion of missed calls to the number of received messages
* proportion_calls_contacts:
proportion of calls contacts in total number of communication contacts
"""
count_calls = count_comms(df_calls)
count_sms = count_comms(df_sms)
count_joined = (
count_calls.merge(
count_sms, on="participant_id", suffixes=("_calls", "_sms")
) # Merge calls and sms features
.reset_index() # Make participant_id a regular column
.assign(
proportion_calls=(
lambda x: x.no_all_calls / (x.no_all_calls + x.no_all_sms)
),
proportion_calls_incoming=(
lambda x: x.no_incoming / (x.no_incoming + x.no_received)
),
proportion_calls_missed_sms_received=(
lambda x: x.no_missed / (x.no_missed + x.no_received)
),
proportion_calls_outgoing=(
lambda x: x.no_outgoing / (x.no_outgoing + x.no_sent)
)
# Calculate new features and create additional columns
)[
[
"participant_id",
"proportion_calls",
"proportion_calls_incoming",
"proportion_calls_outgoing",
"proportion_calls_missed_sms_received",
]
] # Filter out only the relevant features
)
features_calls = contact_features(enumerate_contacts(df_calls))
features_sms = contact_features(enumerate_contacts(df_sms))
features_joined = (
features_calls.merge(
features_sms, on="participant_id", suffixes=("_calls", "_sms")
) # Merge calls and sms features
.reset_index() # Make participant_id a regular column
.assign(
proportion_calls_contacts=(
lambda x: x.no_contacts_calls
/ (x.no_contacts_calls + x.no_contacts_sms)
) # Calculate new features and create additional columns
)[
["participant_id", "proportion_calls_contacts"]
] # Filter out only the relevant features
# Since we are interested only in some features and ignored
# others, a lot of duplicate rows were created. Remove them.
.drop_duplicates()
)
# Join the newly created dataframes
df_calls_sms = count_joined.merge(features_joined, on="participant_id")
return df_calls_sms
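The enumeration step referenced in enumerate_contacts above (contacts sorted by frequency, then numbered within each participant via cumcount) can be illustrated on a toy dataframe; the contact column name and all values here are hypothetical:
import pandas as pd

# Hypothetical per-contact call counts, already sorted so that each participant's
# most frequently contacted person comes first.
contact_counts = pd.DataFrame(
    {
        "participant_id": [1, 1, 1, 2, 2],
        "contact": ["aaa", "bbb", "ccc", "ddd", "eee"],
        "contact_count": [15, 7, 2, 9, 3],
    }
)
# cumcount() numbers rows within each participant starting at 0,
# so the most contacted person gets contact_id 0, the next one 1, and so on.
contact_counts["contact_id"] = contact_counts.groupby("participant_id").cumcount()
print(contact_counts)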

View File

@@ -1,12 +1,14 @@
+import datetime
 from collections.abc import Collection
 import numpy as np
 import pandas as pd
+from pytz import timezone
 from config.models import ESM, Participant
-from features import helper
 from setup import db_engine, session
+TZ_LJ = timezone("Europe/Ljubljana")
 ESM_STATUS_ANSWERED = 2
 GROUP_SESSIONS_BY = ["participant_id", "device_id", "esm_session"]
@@ -65,8 +67,14 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
 df_esm_preprocessed: pd.DataFrame
 A dataframe with added columns: datetime in Ljubljana timezone and all fields from ESM_JSON column.
 """
-df_esm = helper.get_date_from_timestamp(df_esm)
+df_esm["datetime_lj"] = df_esm["double_esm_user_answer_timestamp"].apply(
+lambda x: datetime.datetime.fromtimestamp(x / 1000.0, tz=TZ_LJ)
+)
+df_esm = df_esm.assign(
+date_lj=lambda x: (x.datetime_lj - datetime.timedelta(hours=4)).dt.date
+)
+# Since daytime EMAs could *theoretically* last beyond midnight, but never after 4 AM,
+# the datetime is first translated to 4 h earlier.
 df_esm_json = pd.json_normalize(df_esm["esm_json"]).drop(
 columns=["esm_trigger"]
 )  # The esm_trigger column is already present in the main df.
@@ -248,9 +256,9 @@ def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
 ESM.ESM_TYPE.get("scale"),
 ESM.ESM_TYPE.get("number"),
 ]
-df_esm_clean.loc[
-df_esm_clean["esm_type"].isin(esm_type_numeric)
-] = df_esm_clean.loc[df_esm_clean["esm_type"].isin(esm_type_numeric)].assign(
+df_esm_clean[df_esm_clean["esm_type"].isin(esm_type_numeric)] = df_esm_clean[
+df_esm_clean["esm_type"].isin(esm_type_numeric)
+].assign(
 esm_user_answer_numeric=lambda x: x.esm_user_answer.str.slice(stop=1).astype(
 int
 )
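The clean_up_esm hunk above still relies on scale and number answers being stored as strings whose first character is the numeric code (e.g. "0 - No", as documented for the SAM items below). A tiny illustration of the slicing:
import pandas as pd

answers = pd.Series(["0 - No", "2 - Yes, moderately", "4 - Yes, extremely"])
# The first character of each categorical answer is its numeric code.
print(answers.str.slice(stop=1).astype(int))  # 0, 2, 4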

View File

@@ -1,267 +0,0 @@
import numpy as np
import pandas as pd
import features.esm
QUESTIONNAIRE_ID_SAM = {
"event_stress": 87,
"event_threat": 88,
"event_challenge": 89,
"event_time": 90,
"event_duration": 91,
"event_work_related": 92,
"period_stress": 93,
}
QUESTIONNAIRE_ID_SAM_LOW = min(QUESTIONNAIRE_ID_SAM.values())
QUESTIONNAIRE_ID_SAM_HIGH = max(QUESTIONNAIRE_ID_SAM.values())
GROUP_QUESTIONNAIRES_BY = [
"participant_id",
"device_id",
"esm_session",
]
# Each questionnaire occurs only once within each esm_session on the same device within the same participant.
def extract_stressful_events(df_esm: pd.DataFrame) -> pd.DataFrame:
# 0. Select only questions from Stress Appraisal Measure.
df_esm_preprocessed = features.esm.preprocess_esm(df_esm)
df_esm_sam = df_esm_preprocessed[
(df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_ID_SAM_LOW)
& (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_ID_SAM_HIGH)
]
df_esm_sam_clean = features.esm.clean_up_esm(df_esm_sam)
# 1.
df_esm_event_threat_challenge_mean_wide = calculate_threat_challenge_means(
df_esm_sam_clean
)
# 2.
df_esm_event_stress = detect_stressful_event(df_esm_sam_clean)
# Join to the previously calculated features related to the events.
df_esm_events = df_esm_event_threat_challenge_mean_wide.join(
df_esm_event_stress[
GROUP_QUESTIONNAIRES_BY + ["event_present", "event_stressfulness"]
].set_index(GROUP_QUESTIONNAIRES_BY)
)
# 3.
df_esm_event_work_related = detect_event_work_related(df_esm_sam_clean)
df_esm_events = df_esm_events.join(
df_esm_event_work_related[
GROUP_QUESTIONNAIRES_BY + ["event_work_related"]
].set_index(GROUP_QUESTIONNAIRES_BY)
)
# 4.
df_esm_event_time = convert_event_time(df_esm_sam_clean)
df_esm_events = df_esm_events.join(
df_esm_event_time[GROUP_QUESTIONNAIRES_BY + ["event_time"]].set_index(
GROUP_QUESTIONNAIRES_BY
)
)
# 5.
df_esm_event_duration = extract_event_duration(df_esm_sam_clean)
df_esm_events = df_esm_events.join(
df_esm_event_duration[
GROUP_QUESTIONNAIRES_BY + ["event_duration", "event_duration_info"]
].set_index(GROUP_QUESTIONNAIRES_BY)
)
return df_esm_events
def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
"""
This function calculates challenge and threat (two Stress Appraisal Measure subscales) means,
for each ESM session (within participants and devices).
It creates a grouped dataframe with means in two columns.
Parameters
----------
df_esm_sam_clean: pd.DataFrame
A cleaned up dataframe of Stress Appraisal Measure items.
Returns
-------
df_esm_event_threat_challenge_mean_wide: pd.DataFrame
A dataframe of unique ESM sessions (by participants and devices) with threat and challenge means.
"""
# Select only threat and challenge assessments for events
df_esm_event_threat_challenge = df_esm_sam_clean[
(
df_esm_sam_clean["questionnaire_id"]
== QUESTIONNAIRE_ID_SAM.get("event_threat")
)
| (
df_esm_sam_clean["questionnaire_id"]
== QUESTIONNAIRE_ID_SAM.get("event_challenge")
)
]
# Calculate mean of threat and challenge subscales for each ESM session.
df_esm_event_threat_challenge_mean_wide = pd.pivot_table(
df_esm_event_threat_challenge,
index=["participant_id", "device_id", "esm_session"],
columns=["questionnaire_id"],
values=["esm_user_answer_numeric"],
aggfunc="mean",
)
# Drop unnecessary column values.
df_esm_event_threat_challenge_mean_wide.columns = df_esm_event_threat_challenge_mean_wide.columns.get_level_values(
1
)
df_esm_event_threat_challenge_mean_wide.columns.name = None
df_esm_event_threat_challenge_mean_wide.rename(
columns={
QUESTIONNAIRE_ID_SAM.get("event_threat"): "threat_mean",
QUESTIONNAIRE_ID_SAM.get("event_challenge"): "challenge_mean",
},
inplace=True,
)
return df_esm_event_threat_challenge_mean_wide
def detect_stressful_event(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
"""
Participants were asked: "Was there a particular event that created tension in you?"
The following options were available:
0 - No,
1 - Yes, slightly,
2 - Yes, moderately,
3 - Yes, considerably,
4 - Yes, extremely.
This function indicates whether there was a stressful event (True/False)
and how stressful it was on a scale of 1 to 4.
Parameters
----------
df_esm_sam_clean: pd.DataFrame
A cleaned up dataframe of Stress Appraisal Measure items.
Returns
-------
df_esm_event_stress: pd.DataFrame
The same dataframe with two new columns:
- event_present, indicating whether there was a stressful event at all,
- event_stressfulness, a numeric answer (1-4) to the single item question.
"""
df_esm_event_stress = df_esm_sam_clean[
df_esm_sam_clean["questionnaire_id"] == QUESTIONNAIRE_ID_SAM.get("event_stress")
]
df_esm_event_stress = df_esm_event_stress.assign(
event_present=lambda x: x.esm_user_answer_numeric > 0,
event_stressfulness=lambda x: x.esm_user_answer_numeric,
)
return df_esm_event_stress
def detect_event_work_related(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
"""
This function simply adds a column indicating the answer to the question:
"Was/is this event work-related?"
Parameters
----------
df_esm_sam_clean: pd.DataFrame
A cleaned up dataframe of Stress Appraisal Measure items.
Returns
-------
df_esm_event_stress: pd.DataFrame
The same dataframe with a new column event_work_related (True/False).
"""
df_esm_event_stress = df_esm_sam_clean[
df_esm_sam_clean["questionnaire_id"]
== QUESTIONNAIRE_ID_SAM.get("event_work_related")
]
df_esm_event_stress = df_esm_event_stress.assign(
event_work_related=lambda x: x.esm_user_answer_numeric > 0
)
return df_esm_event_stress
def convert_event_time(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
"""
This function only serves to convert the string datetime answer into a real datetime type.
Errors during this conversion are coerced, meaning that non-datetime answers are assigned Not a Time (NaT).
NOTE: Since the only available non-datetime answer to this question was "0 - I do not remember",
the NaTs can be interpreted to mean this.
Parameters
----------
df_esm_sam_clean: pd.DataFrame
A cleaned up dataframe of Stress Appraisal Measure items.
Returns
-------
df_esm_event_time: pd.DataFrame
The same dataframe with a new column event_time of datetime type.
"""
df_esm_event_time = df_esm_sam_clean[
df_esm_sam_clean["questionnaire_id"] == QUESTIONNAIRE_ID_SAM.get("event_time")
].assign(
event_time=lambda x: pd.to_datetime(
x.esm_user_answer, errors="coerce", infer_datetime_format=True, exact=True
)
)
return df_esm_event_time
def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
"""
If participants indicated a stressful event, they were asked:
"How long did this event last? (Answer in hours and minutes)"
This function extracts this duration time and saves additional answers:
0 - I do not remember,
1 - It is still going on.
Parameters
----------
df_esm_sam_clean: pd.DataFrame
A cleaned up dataframe of Stress Appraisal Measure items.
Returns
-------
df_esm_event_duration: pd.DataFrame
The same dataframe with two new columns:
- event_duration, a time part of a datetime,
- event_duration_info, giving other options to this question:
0 - I do not remember,
1 - It is still going on
"""
df_esm_event_duration = df_esm_sam_clean[
df_esm_sam_clean["questionnaire_id"]
== QUESTIONNAIRE_ID_SAM.get("event_duration")
].assign(
event_duration=lambda x: pd.to_datetime(
x.esm_user_answer.str.slice(start=0, stop=-6), errors="coerce"
).dt.time
)
# TODO Explore the values recorded in event_duration and possibly fix mistakes.
# For example, participants reported setting 23:50:00 instead of 00:50:00.
# For the events that no duration was found (i.e. event_duration = NaT),
# we can determine whether:
# - this event is still going on ("1 - It is still going on")
# - the participant couldn't remember its duration ("0 - I do not remember")
# Generally, these answers were converted to esm_user_answer_numeric in clean_up_esm,
# but only the numeric types of questions and answers.
# Since this was of "datetime" type, convert these specific answers here again.
df_esm_event_duration["event_duration_info"] = np.nan
df_esm_event_duration[
df_esm_event_duration.event_duration.isna()
] = df_esm_event_duration[df_esm_event_duration.event_duration.isna()].assign(
event_duration_info=lambda x: x.esm_user_answer.str.slice(stop=1).astype(int)
)
return df_esm_event_duration
# TODO: How many questions about the stressfulness of the period were asked and how does this relate to events?
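The pivot in calculate_threat_challenge_means turns long-format SAM answers into one row per ESM session with separate threat and challenge means. A minimal sketch with made-up answers, using questionnaire IDs 88 (event_threat) and 89 (event_challenge) from QUESTIONNAIRE_ID_SAM:
import pandas as pd

df_esm_sam_clean = pd.DataFrame(
    {
        "participant_id": [1, 1, 1, 1],
        "device_id": ["d1"] * 4,
        "esm_session": [0, 0, 0, 0],
        "questionnaire_id": [88, 88, 89, 89],
        "esm_user_answer_numeric": [2, 4, 1, 3],
    }
)
# One row per (participant, device, session); columns are the per-questionnaire means.
means_wide = pd.pivot_table(
    df_esm_sam_clean,
    index=["participant_id", "device_id", "esm_session"],
    columns=["questionnaire_id"],
    values=["esm_user_answer_numeric"],
    aggfunc="mean",
)
means_wide.columns = means_wide.columns.get_level_values(1)
means_wide = means_wide.rename(columns={88: "threat_mean", 89: "challenge_mean"})
print(means_wide)  # threat_mean = 3.0, challenge_mean = 2.0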

View File

@@ -1,41 +0,0 @@
import datetime
import pandas as pd
from pytz import timezone
TZ_LJ = timezone("Europe/Ljubljana")
COLUMN_TIMESTAMP = "timestamp"
COLUMN_TIMESTAMP_ESM = "double_esm_user_answer_timestamp"
def get_date_from_timestamp(df_aware) -> pd.DataFrame:
"""
Transform a UNIX timestamp into a datetime (with Ljubljana timezone).
Additionally, extract only the date part, where anything until 4 AM is considered the same day.
Parameters
----------
df_aware: pd.DataFrame
Any AWARE-type data as defined in models.py.
Returns
-------
df_aware: pd.DataFrame
The same dataframe with datetime_lj and date_lj columns added.
"""
if COLUMN_TIMESTAMP_ESM in df_aware:
column_timestamp = COLUMN_TIMESTAMP_ESM
else:
column_timestamp = COLUMN_TIMESTAMP
df_aware["datetime_lj"] = df_aware[column_timestamp].apply(
lambda x: datetime.datetime.fromtimestamp(x / 1000.0, tz=TZ_LJ)
)
df_aware = df_aware.assign(
date_lj=lambda x: (x.datetime_lj - datetime.timedelta(hours=4)).dt.date
)
# Since daytime EMAs could *theoretically* last beyond midnight, but never after 4 AM,
# the datetime is first translated to 4 h earlier.
return df_aware
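The 4-hour shift in get_date_from_timestamp means an answer submitted shortly after midnight is still attributed to the previous day. A worked example with a made-up timestamp:
import datetime
from pytz import timezone

TZ_LJ = timezone("Europe/Ljubljana")

# A hypothetical ESM answer timestamp: 2020-08-02 01:30 local time, in milliseconds.
timestamp_ms = 1596324600000
datetime_lj = datetime.datetime.fromtimestamp(timestamp_ms / 1000.0, tz=TZ_LJ)
date_lj = (datetime_lj - datetime.timedelta(hours=4)).date()
print(datetime_lj)  # 2020-08-02 01:30:00+02:00
print(date_lj)      # 2020-08-01, i.e. the answer counts towards the previous day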

View File

@@ -28,63 +28,3 @@ def get_proximity_data(usernames: Collection) -> pd.DataFrame:
 with db_engine.connect() as connection:
 df_proximity = pd.read_sql(query_proximity.statement, connection)
 return df_proximity
def recode_proximity(df_proximity: pd.DataFrame) -> pd.DataFrame:
"""
This function recodes proximity from a double to a boolean value.
Different proximity sensors report different values,
but in our data only several distinct values have ever been found.
These are therefore converted into "near" and "far" binary values.
See expl_proximity.ipynb for additional info.
Parameters
----------
df_proximity: pd.DataFrame
A dataframe of proximity data.
Returns
-------
df_proximity: pd.DataFrame
The same dataframe with an additional column bool_prox_near,
indicating whether "near" proximity was reported.
False values correspond to "far" reported by this sensor.
"""
df_proximity = df_proximity.assign(bool_prox_near=lambda x: x.double_proximity == 0)
return df_proximity
def count_proximity(
df_proximity: pd.DataFrame, group_by: Collection = ["participant_id"]
) -> pd.DataFrame:
"""
The function counts how many times a "near" value occurs in proximity
and calculates the proportion of these counts to all proximity values (i.e. relative count).
Parameters
----------
df_proximity: pd.DataFrame
A dataframe of proximity data.
group_by: Collection
A list of strings, specifying by which parameters to group.
By default, the features are calculated per participant, but could be "date_lj" etc.
Returns
-------
df_proximity_features: pd.DataFrame
A dataframe with the count of "near" proximity values and their relative count.
"""
if "bool_prox_near" not in df_proximity:
df_proximity = recode_proximity(df_proximity)
df_proximity["bool_prox_far"] = ~df_proximity["bool_prox_near"]
df_proximity_features = df_proximity.groupby(group_by).sum()[
["bool_prox_near", "bool_prox_far"]
]
df_proximity_features = df_proximity_features.assign(
prop_prox_near=lambda x: x.bool_prox_near / (x.bool_prox_near + x.bool_prox_far)
)
df_proximity_features = df_proximity_features.rename(
columns={"bool_prox_near": "freq_prox_near"}
).drop(columns="bool_prox_far", inplace=False)
return df_proximity_features
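To see what recode_proximity and count_proximity produce together, here is a small sketch that repeats their logic on fabricated readings (0.0 is the only value treated as "near"):
import pandas as pd

# Fabricated proximity readings for two participants on one day.
df_proximity = pd.DataFrame(
    {
        "participant_id": [1, 1, 1, 2, 2],
        "date_lj": pd.to_datetime("2020-08-01").date(),
        "double_proximity": [0.0, 5.0, 0.0, 0.0, 8.0],
    }
)
# recode_proximity: 0.0 is reported as "near", everything else as "far".
df_proximity = df_proximity.assign(bool_prox_near=lambda x: x.double_proximity == 0)

# count_proximity, per participant and day: absolute and relative count of "near".
df_proximity["bool_prox_far"] = ~df_proximity["bool_prox_near"]
counts = df_proximity.groupby(["participant_id", "date_lj"]).sum()[
    ["bool_prox_near", "bool_prox_far"]
]
counts = (
    counts.assign(
        prop_prox_near=lambda x: x.bool_prox_near / (x.bool_prox_near + x.bool_prox_far)
    )
    .rename(columns={"bool_prox_near": "freq_prox_near"})
    .drop(columns="bool_prox_far")
)
print(counts)  # participant 1: freq 2, prop 2/3; participant 2: freq 1, prop 1/2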

View File

@@ -6,7 +6,7 @@
 # extension: .py
 # format_name: percent
 # format_version: '1.3'
-# jupytext_version: 1.11.4
+# jupytext_version: 1.11.2
 # kernelspec:
 # display_name: straw2analysis
 # language: python
@@ -14,12 +14,12 @@
 # ---
 # %%
-# %matplotlib inline
 import datetime
+# %%
 import os
 import sys
-import matplotlib.pyplot as plt
 import pandas as pd
 import seaborn as sns
 import statsmodels.api as sm
@@ -31,24 +31,6 @@ if nb_dir not in sys.path:
 import participants.query_db
 from features.esm import *
-# %%
-SAVE_FIGS = True
-FIG_HEIGHT = 5
-FIG_ASPECT = 1.7
-FIG_COLOUR = "#28827C"
-SMALL_SIZE = 14
-MEDIUM_SIZE = SMALL_SIZE + 2
-BIGGER_SIZE = MEDIUM_SIZE + 2
-plt.rc("font", size=SMALL_SIZE)  # controls default text sizes
-plt.rc("axes", titlesize=SMALL_SIZE)  # fontsize of the axes title
-plt.rc("axes", labelsize=MEDIUM_SIZE)  # fontsize of the x and y labels
-plt.rc("xtick", labelsize=SMALL_SIZE)  # fontsize of the tick labels
-plt.rc("ytick", labelsize=SMALL_SIZE)  # fontsize of the tick labels
-plt.rc("legend", fontsize=SMALL_SIZE)  # legend fontsize
-plt.rc("figure", titlesize=BIGGER_SIZE)  # fontsize of the figure title
 # %%
 baseline_si = pd.read_csv("E:/STRAWbaseline/results-survey637813.csv")
 baseline_be_1 = pd.read_csv("E:/STRAWbaseline/results-survey358134.csv")
@@ -148,7 +130,7 @@ df_adherence.describe()
 df_adherence[["gender", "startlanguage"]].value_counts()
 # %%
-sns.displot(df_adherence["finished_sessions"], binwidth=5, height=FIG_HEIGHT)
+sns.displot(df_adherence["finished_sessions"], binwidth=5, height=5)
 # %%
 lm_adherence = smf.ols(
@@ -242,14 +224,12 @@ df_session_workday = df_session_workday.assign(
 g1 = sns.displot(
 df_session_workday["time_diff_minutes"],
 binwidth=5,
-height=FIG_HEIGHT,
-aspect=FIG_ASPECT,
-color=FIG_COLOUR,
+height=5,
+aspect=1.5,
+color="#28827C",
 )
 g1.set_axis_labels("Time difference [min]", "Session count")
-g1.set(xlim=(0, 570))
-if SAVE_FIGS:
-g1.savefig("WorkdayEMAtimeDiff.pdf")
+# g1.savefig("WorkdayEMAtimeDiff.pdf")
 # %% [markdown]
 # There are some sessions that are really close together. By design, none should be closer than 30 min. Let's take a look at those.
@@ -316,13 +296,12 @@ df_mean_daytime_interval.describe()
 g2 = sns.displot(
 df_mean_daytime_interval.time_diff_minutes,
 binwidth=5,
-height=FIG_HEIGHT,
-aspect=FIG_ASPECT,
-color=FIG_COLOUR,
+height=5,
+aspect=1.5,
+color="#28827C",
 )
 g2.set_axis_labels("Median time difference [min]", "Participant count")
-if SAVE_FIGS:
-g2.savefig("WorkdayEMAtimeDiffMedianParticip.pdf")
+# g2.savefig("WorkdayEMAtimeDiffMedianParticip.pdf")
 # %%
 df_adherence = df_adherence.merge(
@@ -348,9 +327,9 @@ df_count_daytime_per_participant["time"].describe()
 sns.displot(
 df_count_daytime_per_participant.time,
 binwidth=1,
-height=FIG_HEIGHT,
-aspect=FIG_ASPECT,
-color=FIG_COLOUR,
+height=5,
+aspect=1.5,
+color="#28827C",
 )
 # %% [markdown]
@@ -385,14 +364,13 @@ s_evening_completed_ratio.describe()
 g3 = sns.displot(
 s_evening_completed_ratio - 0.001,
 binwidth=0.05,
-height=FIG_HEIGHT,
-aspect=FIG_ASPECT,
-color=FIG_COLOUR,
+height=5,
+aspect=1.5,
+color="#28827C",
 )
 g3.set_axis_labels("Ratio of days with the evening EMA filled out", "Participant count")
 g3.set(xlim=(1.01, 0.59))
-if SAVE_FIGS:
-g3.savefig("EveningEMAratioParticip.pdf")
+# g3.savefig("EveningEMAratioParticip.pdf")
 # %%
 df_adherence = df_adherence.merge(
@@ -408,3 +386,5 @@ lr_ols_evening_ratio = smf.ols(
 )
 ls_result_evening_ratio = lr_ols_evening_ratio.fit()
 ls_result_evening_ratio.summary()
+# %%

View File

@@ -16,16 +16,7 @@ class EsmFeatures(unittest.TestCase):
 def test_preprocess_esm(self):
 self.esm_processed = preprocess_esm(self.esm)
-# Check for columns which should have been extracted from esm_json.
 self.assertIn("question_id", self.esm_processed)
-self.assertIn("questionnaire_id", self.esm_processed)
-self.assertIn("esm_instructions", self.esm_processed)
-self.assertIn("esm_type", self.esm_processed)
-self.assertIn("time", self.esm_processed)
-# Check for explicitly added column.
-self.assertIn("datetime_lj", self.esm_processed)
-# All of these keys are referenced in other functions, so they are expected to be present in preprocessed ESM.
-# Since all of these are added in a single function, it should be OK to have many assert statements in one test.
 def test_classify_sessions_by_completion(self):
 self.esm_classified_sessions = classify_sessions_by_completion(