Compare commits
No commits in common. "577a874288e84f5b57c4ae79b89712c67b1285ef" and "1aaf95fe9e3ca9d9bc578d847f1914329f5a27f8" have entirely different histories.
577a874288 ... 1aaf95fe9e
@ -16,7 +16,6 @@ dependencies:
  - python-dotenv
  - pytz
  - seaborn
  - scikit-learn
  - sqlalchemy
  - statsmodels
  - tabulate
@ -1,150 +0,0 @@
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.11.4
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %%
# %matplotlib inline
import datetime
import os
import sys

import seaborn as sns
from sklearn import linear_model
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score

nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

# %%
import participants.query_db
from features import esm, helper, proximity

# %% [markdown]
# # 1. Get the relevant data

# %%
participants_inactive_usernames = participants.query_db.get_usernames(
    collection_start=datetime.date.fromisoformat("2020-08-01")
)
# Consider only two participants to simplify.
ptcp_2 = participants_inactive_usernames[0:2]

# %% [markdown]
# ## 1.1 Labels

# %%
df_esm = esm.get_esm_data(ptcp_2)
df_esm_preprocessed = esm.preprocess_esm(df_esm)

# %%
df_esm_PANAS = df_esm_preprocessed[
    (df_esm_preprocessed["questionnaire_id"] == 8)
    | (df_esm_preprocessed["questionnaire_id"] == 9)
]
df_esm_PANAS_clean = esm.clean_up_esm(df_esm_PANAS)

# %% [markdown]
# ## 1.2 Sensor data

# %%
df_proximity = proximity.get_proximity_data(ptcp_2)
df_proximity = helper.get_date_from_timestamp(df_proximity)
df_proximity = proximity.recode_proximity(df_proximity)

# %% [markdown]
# ## 1.3 Standardization/personalization

# %% [markdown]
# # 2. Grouping/segmentation

# %%
df_esm_PANAS_daily_means = (
    df_esm_PANAS_clean.groupby(["participant_id", "date_lj", "questionnaire_id"])
    .esm_user_answer_numeric.agg("mean")
    .reset_index()
    .rename(columns={"esm_user_answer_numeric": "esm_numeric_mean"})
)

# %%
df_esm_PANAS_daily_means = (
    df_esm_PANAS_daily_means.pivot(
        index=["participant_id", "date_lj"],
        columns="questionnaire_id",
        values="esm_numeric_mean",
    )
    .reset_index(col_level=1)
    .rename(columns={8.0: "PA", 9.0: "NA"})
    .set_index(["participant_id", "date_lj"])
)

# %%
df_proximity_daily_counts = proximity.count_proximity(
    df_proximity, ["participant_id", "date_lj"]
)

# %%
df_proximity_daily_counts

# %% [markdown]
# # 3. Join features (and export to csv?)

# %%
df_full_data_daily_means = df_esm_PANAS_daily_means.join(
    df_proximity_daily_counts
).reset_index()

# %% [markdown]
# # 4. Machine learning model and parameters

# %%
lin_reg_proximity = linear_model.LinearRegression()

# %% [markdown]
# ## 4.1 Validation method

# %%
logo = LeaveOneGroupOut()
logo.get_n_splits(
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
    groups=df_full_data_daily_means["participant_id"],
)

# %% [markdown]
# ## 4.2 Fit results (export?)

# %%
cross_val_score(
    lin_reg_proximity,
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
    groups=df_full_data_daily_means["participant_id"],
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)

# %%
lin_reg_proximity.fit(
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
)

# %%
lin_reg_proximity.score(
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
)
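A minimal sketch (not part of the original notebook) of what the leave-one-group-out validation above implies: with groups set to participant_id, each fold holds out all days of one participant, so the number of splits equals the number of participants.

# Illustrative only; assumes df_full_data_daily_means and logo as defined above.
for train_idx, test_idx in logo.split(
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
    groups=df_full_data_daily_means["participant_id"],
):
    # Each test fold contains the rows of exactly one participant.
    print(df_full_data_daily_means.iloc[test_idx]["participant_id"].unique())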
@ -1,76 +0,0 @@
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.11.4
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %%
# %matplotlib inline
import os
import sys

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

# %%
from config.models import AppCategories, Participant
from setup import db_engine, session

# %%
query_app_categories = session.query(AppCategories)
with db_engine.connect() as connection:
    df_app_categories = pd.read_sql(query_app_categories.statement, connection)

# %%
df_app_categories.head()

# %%
df_app_categories["play_store_genre"].value_counts()

# %%
df_category_not_found = df_app_categories[
    df_app_categories["play_store_genre"] == "not_found"
]

# %%
df_category_not_found["play_store_response"].value_counts()

# %%
df_category_not_found["package_name"].value_counts()

# %%
manufacturers = [
    "samsung",
    "oneplus",
    "huawei",
    "xiaomi",
    "lge",
    "motorola",
    "miui",
    "lenovo",
    "oppo",
    "mediatek",
]
custom_rom = ["coloros", "lineageos", "myos", "cyanogenmod", "foundation.e"]
other = ["android", "wssyncmldm"]
rows_os_manufacturer = df_category_not_found["package_name"].str.contains(
    "|".join(manufacturers + custom_rom + other), case=False
)

# %%
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(df_category_not_found.loc[~rows_os_manufacturer])
@ -6,7 +6,7 @@
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.11.4
#       jupytext_version: 1.11.2
#   kernelspec:
#     display_name: straw2analysis
#     language: python
@ -14,7 +14,6 @@
# ---

# %%
# %matplotlib inline
import os
import sys

@ -54,15 +53,6 @@ import participants.query_db
participants_inactive_usernames = participants.query_db.get_usernames()
df_calls_inactive = get_call_data(participants_inactive_usernames)

# %%
participants_inactive_usernames

# %%
df_calls_inactive.head()

# %%
enumerate_contacts(df_calls_inactive).head()

# %%
df_calls_features = count_comms(df_calls_inactive)
df_calls_features.head()
@ -80,9 +70,6 @@ calls_number = pd.wide_to_long(
    suffix="\D+",
)

# %%
calls_number

# %%
sns.displot(calls_number, x="no", hue="call_type", binwidth=5, element="step", height=8)
@ -139,30 +126,3 @@ sms_number = pd.wide_to_long(
sns.displot(
    sms_number, x="no", hue="message_type", binwidth=5, element="step", height=8
)

# %% [markdown]
# # Communication features

# %%
df_calls_enumerated = enumerate_contacts(df_calls)
display(df_calls_enumerated)

# %%
df_calls_contact_features = contact_features(df_calls_enumerated)
display(df_calls_contact_features)

# %%
df_sms_enumerated = enumerate_contacts(df_sms)
df_sms_contact_features = contact_features(df_sms_enumerated)
display(df_sms_contact_features)

# %%
display(count_comms(df_calls))

# %%
display(count_comms(df_sms))

# %%
display(calls_sms_features(df_calls, df_sms))

# %%
@ -6,7 +6,7 @@
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.11.4
#       jupytext_version: 1.11.2
#   kernelspec:
#     display_name: straw2analysis
#     language: python
@ -14,7 +14,6 @@
# ---

# %%
# %matplotlib inline
import os
import sys

@ -86,8 +86,7 @@ def enumerate_contacts(comm_df: pd.DataFrame) -> pd.DataFrame:
    # In other words, recode the contacts into integers from 0 to n_contacts,
    # so that the first one is contacted the most often.
    contact_ids = (
        # Group again for enumeration.
        contact_counts.groupby("participant_id")
        contact_counts.groupby("participant_id")  # Group again for enumeration.
        .cumcount()  # Enumerate (count) rows *within* participants.
        .to_frame("contact_id")
    )
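The hunk above only shows the enumeration step; the following hypothetical, self-contained illustration (column names invented, not the project's) shows how groupby().cumcount() numbers already-sorted contacts within each participant:

import pandas as pd

demo = pd.DataFrame(
    {"participant_id": [1, 1, 1, 2, 2], "contact": ["a", "b", "c", "x", "y"]}
)
# Rows are assumed to be sorted by per-contact frequency already;
# cumcount() then yields 0, 1, 2, ... within each participant.
demo["contact_id"] = demo.groupby("participant_id").cumcount()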
@ -177,148 +176,15 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame:
    return comm_features


def contact_features(df_enumerated: pd.DataFrame) -> pd.DataFrame:
    """
    Counts the number of people contacted (for each participant) and, if
    df_enumerated is a dataframe containing calls data, the total duration
    of calls between a participant and each of her contacts.

    Parameters
    ----------
    df_enumerated: pd.DataFrame
        A dataframe of calls or SMSes; return of function enumerate_contacts.

    Returns
    -------
    comm_df: pd.DataFrame
        The altered dataframe with the column no_contacts and, if df_enumerated
        contains calls data, an additional column total_call_duration.
    """

    # Check whether df contains calls or SMS data since some
    # features we want to calculate are type-specific.
    if "call_duration" in df_enumerated:
        # Add a column with the total duration of calls between two people.
        duration_count = (
            df_enumerated.groupby(["participant_id", "contact_id"])
            # For each participant and for each caller, sum durations of their calls.
            ["call_duration"]
            .sum()
            .reset_index()  # Make index (which is actually the participant id) a normal column.
            .rename(columns={"call_duration": "total_call_duration"})
        )
        # The new dataframe now contains columns containing information about
        # participants, callers and the total duration of their calls. All that
        # is now left to do is to merge the original df with the new one.
        df_enumerated = df_enumerated.merge(
            duration_count, on=["participant_id", "contact_id"]
        )

    contact_count = (
        df_enumerated.groupby(["participant_id"])
        .nunique()[
            "contact_id"
        ]  # For each participant, count the number of distinct contacts.
        .reset_index()  # Make index (which is actually the participant id) a normal column.
        .rename(columns={"contact_id": "no_contacts"})
    )

    df_enumerated = (
        # Merge df with the newly created df containing info about the number of contacts.
        df_enumerated.merge(contact_count, on="participant_id")
        # Sort first by participant_id and then by contact_id and
        # thereby restore the initial ordering of input dataframes.
        .sort_values(["participant_id", "contact_id"])
    )

    # TODO: Determine work vs non-work contacts by work hours heuristics.

    return df_enumerated
def contact_features():
    # TODO Implement a method that takes a DF with enumerated contacts as argument and calculates:
    # * Duration of calls per caller (for most common callers)
    # * Determine work vs non-work contacts by work hours heuristics
    # * Number of people contacted
    # And similarly for SMS.
    pass


def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataFrame:
    """
    Calculates additional features relating calls and sms data.

    Parameters
    ----------
    df_calls: pd.DataFrame
        A dataframe of calls (return of get_call_data).
    df_sms: pd.DataFrame
        A dataframe of SMSes (return of get_sms_data).

    Returns
    -------
    df_calls_sms: pd.DataFrame
        The list of features relating calls and sms data for every participant.
        These are:
        * proportion_calls:
            proportion of calls in total number of communications
        * proportion_calls_incoming:
            proportion of incoming calls in total number of incoming/received communications
        * proportion_calls_outgoing:
            proportion of outgoing calls in total number of outgoing/sent communications
        * proportion_calls_missed_sms_received:
            proportion of missed calls to the number of received messages
        * proportion_calls_contacts:
            proportion of calls contacts in total number of communication contacts
    """

    count_calls = count_comms(df_calls)
    count_sms = count_comms(df_sms)

    count_joined = (
        count_calls.merge(
            count_sms, on="participant_id", suffixes=("_calls", "_sms")
        )  # Merge calls and sms features
        .reset_index()  # Make participant_id a regular column
        .assign(
            proportion_calls=(
                lambda x: x.no_all_calls / (x.no_all_calls + x.no_all_sms)
            ),
            proportion_calls_incoming=(
                lambda x: x.no_incoming / (x.no_incoming + x.no_received)
            ),
            proportion_calls_missed_sms_received=(
                lambda x: x.no_missed / (x.no_missed + x.no_received)
            ),
            proportion_calls_outgoing=(
                lambda x: x.no_outgoing / (x.no_outgoing + x.no_sent)
            )
            # Calculate new features and create additional columns
        )[
            [
                "participant_id",
                "proportion_calls",
                "proportion_calls_incoming",
                "proportion_calls_outgoing",
                "proportion_calls_missed_sms_received",
            ]
        ]  # Filter out only the relevant features
    )

    features_calls = contact_features(enumerate_contacts(df_calls))
    features_sms = contact_features(enumerate_contacts(df_sms))

    features_joined = (
        features_calls.merge(
            features_sms, on="participant_id", suffixes=("_calls", "_sms")
        )  # Merge calls and sms features
        .reset_index()  # Make participant_id a regular column
        .assign(
            proportion_calls_contacts=(
                lambda x: x.no_contacts_calls
                / (x.no_contacts_calls + x.no_contacts_sms)
            )  # Calculate new features and create additional columns
        )[
            ["participant_id", "proportion_calls_contacts"]
        ]  # Filter out only the relevant features
        # Since we are interested only in some features and ignored
        # others, a lot of duplicate rows were created. Remove them.
        .drop_duplicates()
    )

    # Join the newly created dataframes
    df_calls_sms = count_joined.merge(features_joined, on="participant_id")

    return df_calls_sms


def calls_sms_features():
    # TODO Relate the calls and sms data, such as comparing the number of (missed) calls and messages.
    pass
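A worked example (numbers invented) of the proportion features defined above, using the count_comms column names referenced in the lambdas:

no_all_calls, no_all_sms = 20, 30
proportion_calls = no_all_calls / (no_all_calls + no_all_sms)  # 0.4

no_incoming, no_received = 8, 12
proportion_calls_incoming = no_incoming / (no_incoming + no_received)  # 0.4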
@ -1,12 +1,14 @@
import datetime
from collections.abc import Collection

import numpy as np
import pandas as pd
from pytz import timezone

from config.models import ESM, Participant
from features import helper
from setup import db_engine, session

TZ_LJ = timezone("Europe/Ljubljana")
ESM_STATUS_ANSWERED = 2

GROUP_SESSIONS_BY = ["participant_id", "device_id", "esm_session"]
@ -65,8 +67,14 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
    df_esm_preprocessed: pd.DataFrame
        A dataframe with added columns: datetime in Ljubljana timezone and all fields from ESM_JSON column.
    """
    df_esm = helper.get_date_from_timestamp(df_esm)

    df_esm["datetime_lj"] = df_esm["double_esm_user_answer_timestamp"].apply(
        lambda x: datetime.datetime.fromtimestamp(x / 1000.0, tz=TZ_LJ)
    )
    df_esm = df_esm.assign(
        date_lj=lambda x: (x.datetime_lj - datetime.timedelta(hours=4)).dt.date
    )
    # Since daytime EMAs could *theoretically* last beyond midnight, but never after 4 AM,
    # the datetime is first translated to 4 h earlier.
    df_esm_json = pd.json_normalize(df_esm["esm_json"]).drop(
        columns=["esm_trigger"]
    )  # The esm_trigger column is already present in the main df.
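A tiny illustration (field names taken from elsewhere in this diff, values invented) of the flattening step: each esm_json entry is a mapping of question metadata that pd.json_normalize expands into columns.

import pandas as pd

# Produces a dataframe with columns questionnaire_id and esm_type.
pd.json_normalize(
    [{"questionnaire_id": 8, "esm_type": 2}, {"questionnaire_id": 9, "esm_type": 2}]
)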
@ -248,9 +256,9 @@ def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
        ESM.ESM_TYPE.get("scale"),
        ESM.ESM_TYPE.get("number"),
    ]
    df_esm_clean.loc[
        df_esm_clean["esm_type"].isin(esm_type_numeric)
    ] = df_esm_clean.loc[df_esm_clean["esm_type"].isin(esm_type_numeric)].assign(
    df_esm_clean[df_esm_clean["esm_type"].isin(esm_type_numeric)] = df_esm_clean[
        df_esm_clean["esm_type"].isin(esm_type_numeric)
    ].assign(
        esm_user_answer_numeric=lambda x: x.esm_user_answer.str.slice(stop=1).astype(
            int
        )
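An illustration of the numeric conversion above: scale answers are stored as strings such as "3 - Yes, considerably" (see detect_stressful_event further down), so slicing the first character and casting to int recovers the numeric value.

answer = "3 - Yes, considerably"  # example answer format
int(answer[:1])  # -> 3, mirroring .str.slice(stop=1).astype(int)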
@ -1,267 +0,0 @@
import numpy as np
import pandas as pd

import features.esm

QUESTIONNAIRE_ID_SAM = {
    "event_stress": 87,
    "event_threat": 88,
    "event_challenge": 89,
    "event_time": 90,
    "event_duration": 91,
    "event_work_related": 92,
    "period_stress": 93,
}
QUESTIONNAIRE_ID_SAM_LOW = min(QUESTIONNAIRE_ID_SAM.values())
QUESTIONNAIRE_ID_SAM_HIGH = max(QUESTIONNAIRE_ID_SAM.values())

GROUP_QUESTIONNAIRES_BY = [
    "participant_id",
    "device_id",
    "esm_session",
]
# Each questionnaire occurs only once within each esm_session on the same device within the same participant.


def extract_stressful_events(df_esm: pd.DataFrame) -> pd.DataFrame:
    # 0. Select only questions from Stress Appraisal Measure.
    df_esm_preprocessed = features.esm.preprocess_esm(df_esm)
    df_esm_sam = df_esm_preprocessed[
        (df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_ID_SAM_LOW)
        & (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_ID_SAM_HIGH)
    ]

    df_esm_sam_clean = features.esm.clean_up_esm(df_esm_sam)
    # 1.
    df_esm_event_threat_challenge_mean_wide = calculate_threat_challenge_means(
        df_esm_sam_clean
    )
    # 2.
    df_esm_event_stress = detect_stressful_event(df_esm_sam_clean)

    # Join to the previously calculated features related to the events.
    df_esm_events = df_esm_event_threat_challenge_mean_wide.join(
        df_esm_event_stress[
            GROUP_QUESTIONNAIRES_BY + ["event_present", "event_stressfulness"]
        ].set_index(GROUP_QUESTIONNAIRES_BY)
    )

    # 3.
    df_esm_event_work_related = detect_event_work_related(df_esm_sam_clean)

    df_esm_events = df_esm_events.join(
        df_esm_event_work_related[
            GROUP_QUESTIONNAIRES_BY + ["event_work_related"]
        ].set_index(GROUP_QUESTIONNAIRES_BY)
    )

    # 4.
    df_esm_event_time = convert_event_time(df_esm_sam_clean)

    df_esm_events = df_esm_events.join(
        df_esm_event_time[GROUP_QUESTIONNAIRES_BY + ["event_time"]].set_index(
            GROUP_QUESTIONNAIRES_BY
        )
    )

    # 5.
    df_esm_event_duration = extract_event_duration(df_esm_sam_clean)

    df_esm_events = df_esm_events.join(
        df_esm_event_duration[
            GROUP_QUESTIONNAIRES_BY + ["event_duration", "event_duration_info"]
        ].set_index(GROUP_QUESTIONNAIRES_BY)
    )

    return df_esm_events

def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    """
    This function calculates challenge and threat (two Stress Appraisal Measure subscales) means
    for each ESM session (within participants and devices).
    It creates a grouped dataframe with means in two columns.

    Parameters
    ----------
    df_esm_sam_clean: pd.DataFrame
        A cleaned up dataframe of Stress Appraisal Measure items.

    Returns
    -------
    df_esm_event_threat_challenge_mean_wide: pd.DataFrame
        A dataframe of unique ESM sessions (by participants and devices) with threat and challenge means.
    """
    # Select only threat and challenge assessments for events.
    df_esm_event_threat_challenge = df_esm_sam_clean[
        (
            df_esm_sam_clean["questionnaire_id"]
            == QUESTIONNAIRE_ID_SAM.get("event_threat")
        )
        | (
            df_esm_sam_clean["questionnaire_id"]
            == QUESTIONNAIRE_ID_SAM.get("event_challenge")
        )
    ]
    # Calculate mean of threat and challenge subscales for each ESM session.
    df_esm_event_threat_challenge_mean_wide = pd.pivot_table(
        df_esm_event_threat_challenge,
        index=["participant_id", "device_id", "esm_session"],
        columns=["questionnaire_id"],
        values=["esm_user_answer_numeric"],
        aggfunc="mean",
    )
    # Drop unnecessary column values.
    df_esm_event_threat_challenge_mean_wide.columns = df_esm_event_threat_challenge_mean_wide.columns.get_level_values(
        1
    )
    df_esm_event_threat_challenge_mean_wide.columns.name = None
    df_esm_event_threat_challenge_mean_wide.rename(
        columns={
            QUESTIONNAIRE_ID_SAM.get("event_threat"): "threat_mean",
            QUESTIONNAIRE_ID_SAM.get("event_challenge"): "challenge_mean",
        },
        inplace=True,
    )
    return df_esm_event_threat_challenge_mean_wide

def detect_stressful_event(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    """
    Participants were asked: "Was there a particular event that created tension in you?"
    The following options were available:
        0 - No,
        1 - Yes, slightly,
        2 - Yes, moderately,
        3 - Yes, considerably,
        4 - Yes, extremely.
    This function indicates whether there was a stressful event (True/False)
    and how stressful it was on a scale of 1 to 4.

    Parameters
    ----------
    df_esm_sam_clean: pd.DataFrame
        A cleaned up dataframe of Stress Appraisal Measure items.

    Returns
    -------
    df_esm_event_stress: pd.DataFrame
        The same dataframe with two new columns:
        - event_present, indicating whether there was a stressful event at all,
        - event_stressfulness, a numeric answer (1-4) to the single item question.

    """
    df_esm_event_stress = df_esm_sam_clean[
        df_esm_sam_clean["questionnaire_id"] == QUESTIONNAIRE_ID_SAM.get("event_stress")
    ]
    df_esm_event_stress = df_esm_event_stress.assign(
        event_present=lambda x: x.esm_user_answer_numeric > 0,
        event_stressfulness=lambda x: x.esm_user_answer_numeric,
    )
    return df_esm_event_stress


def detect_event_work_related(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    """
    This function simply adds a column indicating the answer to the question:
    "Was/is this event work-related?"

    Parameters
    ----------
    df_esm_sam_clean: pd.DataFrame
        A cleaned up dataframe of Stress Appraisal Measure items.

    Returns
    -------
    df_esm_event_stress: pd.DataFrame
        The same dataframe with a new column event_work_related (True/False).

    """
    df_esm_event_stress = df_esm_sam_clean[
        df_esm_sam_clean["questionnaire_id"]
        == QUESTIONNAIRE_ID_SAM.get("event_work_related")
    ]
    df_esm_event_stress = df_esm_event_stress.assign(
        event_work_related=lambda x: x.esm_user_answer_numeric > 0
    )
    return df_esm_event_stress

def convert_event_time(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    """
    This function only serves to convert the string datetime answer into a real datetime type.
    Errors during this conversion are coerced, meaning that non-datetime answers are assigned Not a Time (NaT).
    NOTE: Since the only available non-datetime answer to this question was "0 - I do not remember",
    the NaTs can be interpreted to mean this.

    Parameters
    ----------
    df_esm_sam_clean: pd.DataFrame
        A cleaned up dataframe of Stress Appraisal Measure items.

    Returns
    -------
    df_esm_event_time: pd.DataFrame
        The same dataframe with a new column event_time of datetime type.
    """
    df_esm_event_time = df_esm_sam_clean[
        df_esm_sam_clean["questionnaire_id"] == QUESTIONNAIRE_ID_SAM.get("event_time")
    ].assign(
        event_time=lambda x: pd.to_datetime(
            x.esm_user_answer, errors="coerce", infer_datetime_format=True, exact=True
        )
    )
    return df_esm_event_time


def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    """
    If participants indicated a stressful event, they were asked:
    "How long did this event last? (Answer in hours and minutes)"
    This function extracts this duration time and saves additional answers:
        0 - I do not remember,
        1 - It is still going on.

    Parameters
    ----------
    df_esm_sam_clean: pd.DataFrame
        A cleaned up dataframe of Stress Appraisal Measure items.

    Returns
    -------
    df_esm_event_duration: pd.DataFrame
        The same dataframe with two new columns:
        - event_duration, a time part of a datetime,
        - event_duration_info, giving other options to this question:
            0 - I do not remember,
            1 - It is still going on
    """
    df_esm_event_duration = df_esm_sam_clean[
        df_esm_sam_clean["questionnaire_id"]
        == QUESTIONNAIRE_ID_SAM.get("event_duration")
    ].assign(
        event_duration=lambda x: pd.to_datetime(
            x.esm_user_answer.str.slice(start=0, stop=-6), errors="coerce"
        ).dt.time
    )
    # TODO Explore the values recorded in event_duration and possibly fix mistakes.
    # For example, participants reported setting 23:50:00 instead of 00:50:00.

    # For the events for which no duration was found (i.e. event_duration = NaT),
    # we can determine whether:
    #    - this event is still going on ("1 - It is still going on")
    #    - the participant couldn't remember its duration ("0 - I do not remember")
    # Generally, these answers were converted to esm_user_answer_numeric in clean_up_esm,
    # but only for the numeric types of questions and answers.
    # Since this was of "datetime" type, convert these specific answers here again.
    df_esm_event_duration["event_duration_info"] = np.nan
    df_esm_event_duration[
        df_esm_event_duration.event_duration.isna()
    ] = df_esm_event_duration[df_esm_event_duration.event_duration.isna()].assign(
        event_duration_info=lambda x: x.esm_user_answer.str.slice(stop=1).astype(int)
    )

    return df_esm_event_duration


# TODO: How many questions about the stressfulness of the period were asked and how does this relate to events?
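A hedged usage sketch of the entry point above; the data-loading call mirrors the notebooks elsewhere in this diff, and the module name stressfulness_event is an assumption, since file names are not shown here.

import features.esm
from features.stressfulness_event import extract_stressful_events  # module name assumed

df_esm = features.esm.get_esm_data(["example_username"])  # participant list is illustrative
df_esm_events = extract_stressful_events(df_esm)  # event features per ESM session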
@ -1,41 +0,0 @@
import datetime

import pandas as pd
from pytz import timezone

TZ_LJ = timezone("Europe/Ljubljana")
COLUMN_TIMESTAMP = "timestamp"
COLUMN_TIMESTAMP_ESM = "double_esm_user_answer_timestamp"


def get_date_from_timestamp(df_aware) -> pd.DataFrame:
    """
    Transform a UNIX timestamp into a datetime (with Ljubljana timezone).
    Additionally, extract only the date part, where anything until 4 AM is considered the same day.

    Parameters
    ----------
    df_aware: pd.DataFrame
        Any AWARE-type data as defined in models.py.

    Returns
    -------
    df_aware: pd.DataFrame
        The same dataframe with datetime_lj and date_lj columns added.

    """
    if COLUMN_TIMESTAMP_ESM in df_aware:
        column_timestamp = COLUMN_TIMESTAMP_ESM
    else:
        column_timestamp = COLUMN_TIMESTAMP

    df_aware["datetime_lj"] = df_aware[column_timestamp].apply(
        lambda x: datetime.datetime.fromtimestamp(x / 1000.0, tz=TZ_LJ)
    )
    df_aware = df_aware.assign(
        date_lj=lambda x: (x.datetime_lj - datetime.timedelta(hours=4)).dt.date
    )
    # Since daytime EMAs could *theoretically* last beyond midnight, but never after 4 AM,
    # the datetime is first translated to 4 h earlier.

    return df_aware
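A worked example (timestamp invented) of the 4 AM convention above: an answer given at 01:30 on 2 August 2020 Ljubljana time is shifted back four hours before taking the date, so it is attributed to 1 August 2020.

import datetime
from pytz import timezone

TZ_LJ = timezone("Europe/Ljubljana")
dt = TZ_LJ.localize(datetime.datetime(2020, 8, 2, 1, 30))
date_lj = (dt - datetime.timedelta(hours=4)).date()  # datetime.date(2020, 8, 1)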
@ -28,63 +28,3 @@ def get_proximity_data(usernames: Collection) -> pd.DataFrame:
    with db_engine.connect() as connection:
        df_proximity = pd.read_sql(query_proximity.statement, connection)
    return df_proximity


def recode_proximity(df_proximity: pd.DataFrame) -> pd.DataFrame:
    """
    This function recodes proximity from a double to a boolean value.
    Different proximity sensors report different values,
    but in our data only several distinct values have ever been found.
    These are therefore converted into "near" and "far" binary values.
    See expl_proximity.ipynb for additional info.

    Parameters
    ----------
    df_proximity: pd.DataFrame
        A dataframe of proximity data.

    Returns
    -------
    df_proximity: pd.DataFrame
        The same dataframe with an additional column bool_prox_near,
        indicating whether "near" proximity was reported.
        False values correspond to "far" reported by this sensor.

    """
    df_proximity = df_proximity.assign(bool_prox_near=lambda x: x.double_proximity == 0)
    return df_proximity


def count_proximity(
    df_proximity: pd.DataFrame, group_by: Collection = ["participant_id"]
) -> pd.DataFrame:
    """
    The function counts how many times a "near" value occurs in proximity
    and calculates the proportion of these counts relative to all proximity values (i.e. the relative count).

    Parameters
    ----------
    df_proximity: pd.DataFrame
        A dataframe of proximity data.
    group_by: Collection
        A list of strings, specifying by which parameters to group.
        By default, the features are calculated per participant, but could be "date_lj" etc.

    Returns
    -------
    df_proximity_features: pd.DataFrame
        A dataframe with the count of "near" proximity values and their relative count.
    """
    if "bool_prox_near" not in df_proximity:
        df_proximity = recode_proximity(df_proximity)
    df_proximity["bool_prox_far"] = ~df_proximity["bool_prox_near"]
    df_proximity_features = df_proximity.groupby(group_by).sum()[
        ["bool_prox_near", "bool_prox_far"]
    ]
    df_proximity_features = df_proximity_features.assign(
        prop_prox_near=lambda x: x.bool_prox_near / (x.bool_prox_near + x.bool_prox_far)
    )
    df_proximity_features = df_proximity_features.rename(
        columns={"bool_prox_near": "freq_prox_near"}
    ).drop(columns="bool_prox_far", inplace=False)
    return df_proximity_features
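A minimal usage sketch of the two functions above (sensor values invented): recode raw double_proximity readings into bool_prox_near, then count them per participant.

import pandas as pd

df_demo = pd.DataFrame(
    {"participant_id": [1, 1, 1, 2], "double_proximity": [0.0, 5.0, 0.0, 8.0]}
)
df_demo = recode_proximity(df_demo)  # adds bool_prox_near (True where the reading is 0)
count_proximity(df_demo)  # freq_prox_near and prop_prox_near per participant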
@ -6,7 +6,7 @@
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.11.4
#       jupytext_version: 1.11.2
#   kernelspec:
#     display_name: straw2analysis
#     language: python
@ -14,12 +14,12 @@
# ---

# %%
# %matplotlib inline
import datetime

# %%
import os
import sys

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
@ -31,24 +31,6 @@ if nb_dir not in sys.path:
import participants.query_db
from features.esm import *

# %%
SAVE_FIGS = True
FIG_HEIGHT = 5
FIG_ASPECT = 1.7
FIG_COLOUR = "#28827C"

SMALL_SIZE = 14
MEDIUM_SIZE = SMALL_SIZE + 2
BIGGER_SIZE = MEDIUM_SIZE + 2

plt.rc("font", size=SMALL_SIZE)  # controls default text sizes
plt.rc("axes", titlesize=SMALL_SIZE)  # fontsize of the axes title
plt.rc("axes", labelsize=MEDIUM_SIZE)  # fontsize of the x and y labels
plt.rc("xtick", labelsize=SMALL_SIZE)  # fontsize of the tick labels
plt.rc("ytick", labelsize=SMALL_SIZE)  # fontsize of the tick labels
plt.rc("legend", fontsize=SMALL_SIZE)  # legend fontsize
plt.rc("figure", titlesize=BIGGER_SIZE)  # fontsize of the figure title

# %%
baseline_si = pd.read_csv("E:/STRAWbaseline/results-survey637813.csv")
baseline_be_1 = pd.read_csv("E:/STRAWbaseline/results-survey358134.csv")
@ -148,7 +130,7 @@ df_adherence.describe()
df_adherence[["gender", "startlanguage"]].value_counts()

# %%
sns.displot(df_adherence["finished_sessions"], binwidth=5, height=FIG_HEIGHT)
sns.displot(df_adherence["finished_sessions"], binwidth=5, height=5)

# %%
lm_adherence = smf.ols(
@ -242,14 +224,12 @@ df_session_workday = df_session_workday.assign(
g1 = sns.displot(
    df_session_workday["time_diff_minutes"],
    binwidth=5,
    height=FIG_HEIGHT,
    aspect=FIG_ASPECT,
    color=FIG_COLOUR,
    height=5,
    aspect=1.5,
    color="#28827C",
)
g1.set_axis_labels("Time difference [min]", "Session count")
g1.set(xlim=(0, 570))
if SAVE_FIGS:
    g1.savefig("WorkdayEMAtimeDiff.pdf")
# g1.savefig("WorkdayEMAtimeDiff.pdf")

# %% [markdown]
# There are some sessions that are really close together. By design, none should be closer than 30 min. Let's take a look at those.
@ -316,13 +296,12 @@ df_mean_daytime_interval.describe()
g2 = sns.displot(
    df_mean_daytime_interval.time_diff_minutes,
    binwidth=5,
    height=FIG_HEIGHT,
    aspect=FIG_ASPECT,
    color=FIG_COLOUR,
    height=5,
    aspect=1.5,
    color="#28827C",
)
g2.set_axis_labels("Median time difference [min]", "Participant count")
if SAVE_FIGS:
    g2.savefig("WorkdayEMAtimeDiffMedianParticip.pdf")
# g2.savefig("WorkdayEMAtimeDiffMedianParticip.pdf")

# %%
df_adherence = df_adherence.merge(
@ -348,9 +327,9 @@ df_count_daytime_per_participant["time"].describe()
sns.displot(
    df_count_daytime_per_participant.time,
    binwidth=1,
    height=FIG_HEIGHT,
    aspect=FIG_ASPECT,
    color=FIG_COLOUR,
    height=5,
    aspect=1.5,
    color="#28827C",
)

# %% [markdown]
@ -385,14 +364,13 @@ s_evening_completed_ratio.describe()
g3 = sns.displot(
    s_evening_completed_ratio - 0.001,
    binwidth=0.05,
    height=FIG_HEIGHT,
    aspect=FIG_ASPECT,
    color=FIG_COLOUR,
    height=5,
    aspect=1.5,
    color="#28827C",
)
g3.set_axis_labels("Ratio of days with the evening EMA filled out", "Participant count")
g3.set(xlim=(1.01, 0.59))
if SAVE_FIGS:
    g3.savefig("EveningEMAratioParticip.pdf")
# g3.savefig("EveningEMAratioParticip.pdf")

# %%
df_adherence = df_adherence.merge(
@ -408,3 +386,5 @@ lr_ols_evening_ratio = smf.ols(
)
ls_result_evening_ratio = lr_ols_evening_ratio.fit()
ls_result_evening_ratio.summary()

# %%
@ -16,16 +16,7 @@ class EsmFeatures(unittest.TestCase):

    def test_preprocess_esm(self):
        self.esm_processed = preprocess_esm(self.esm)
        # Check for columns which should have been extracted from esm_json.
        self.assertIn("question_id", self.esm_processed)
        self.assertIn("questionnaire_id", self.esm_processed)
        self.assertIn("esm_instructions", self.esm_processed)
        self.assertIn("esm_type", self.esm_processed)
        self.assertIn("time", self.esm_processed)
        # Check for explicitly added column.
        self.assertIn("datetime_lj", self.esm_processed)
        # All of these keys are referenced in other functions, so they are expected to be present in preprocessed ESM.
        # Since all of these are added in a single function, it should be OK to have many assert statements in one test.

    def test_classify_sessions_by_completion(self):
        self.esm_classified_sessions = classify_sessions_by_completion(