stress_at_work_analysis/exploration/expl_communication.py

# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.11.4
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %%
# %matplotlib inline
import os
import sys

import matplotlib.pyplot as plt

# %%
import seaborn as sns

# Put the parent of the current working directory on the import path (once),
# so that packages located next to the notebook's folder can be imported.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

# %%
from features.communication import *

# %% [markdown]
# # Example of communication data and feature calculation

# %%
# Fetch the call data for a single example username and print the result.
df_calls = get_call_data(["nokia_0000003"])
print(df_calls)

# %%
# Run count_comms() on the example calls; as the last expression of the cell,
# the returned object is rendered by the notebook.
count_comms(df_calls)

# %%
# Same steps for SMS data of the same example username.
df_sms = get_sms_data(["nokia_0000003"])
count_comms(df_sms)

# %% [markdown]
# # Call data

# %%
import participants.query_db

# %%
# Get the usernames from the participants database and load their call data.
# NOTE(review): the variable name says "inactive", but get_usernames() is
# called with its default arguments here — confirm which participants it returns.
participants_inactive_usernames = participants.query_db.get_usernames()
df_calls_inactive = get_call_data(participants_inactive_usernames)

# %%
# Show the usernames that were used for the query.
participants_inactive_usernames

# %%
# Preview the first rows of the loaded call data.
df_calls_inactive.head()

# %%
# Preview the output of enumerate_contacts(); the result is not stored here.
enumerate_contacts(df_calls_inactive).head()

# %%
# Compute the call features with count_comms() and preview them.
df_calls_features = count_comms(df_calls_inactive)
df_calls_features.head()

# %%
# Summary statistics of the call features.
df_calls_features.describe()

# %%
# Reshape the call-count columns from wide to long format: the columns
# no_incoming / no_outgoing / no_missed become a single "no" column, with the
# part after "no_" (incoming, outgoing, missed) stored in "call_type".
# reset_index() turns the index into a regular column, which wide_to_long needs
# for i="participant_id" (assumes the features are indexed by participant_id).
calls_number = pd.wide_to_long(
    df_calls_features[["no_incoming", "no_outgoing", "no_missed"]].reset_index(),
    i="participant_id",
    j="call_type",
    stubnames="no",
    sep="_",
    # Raw string: "\D" is not a valid escape sequence in a normal string
    # literal and triggers a warning on recent Python versions.
    suffix=r"\D+",
)

# %%
# Show the reshaped table.
calls_number

# %%
# Distribution of the number of calls, one step histogram per call type,
# with bins of width 5.
sns.displot(calls_number, x="no", hue="call_type", binwidth=5, element="step", height=8)

# %%
# Reshape the total call durations from wide to long format: both
# duration_total_* columns are stacked into a single "duration" column.
# With stub "duration" and separator "_", the remainder of each column name
# ("total_incoming", "total_outgoing") becomes the value of "call_type".
calls_duration = pd.wide_to_long(
    df_calls_features[
        ["duration_total_incoming", "duration_total_outgoing"]
    ].reset_index(),
    i="participant_id",
    j="call_type",
    stubnames="duration",
    sep="_",
    # Raw string: "\D" is not a valid escape sequence in a normal string
    # literal and triggers a warning on recent Python versions.
    suffix=r"\D+",
)
# Histogram of total durations per call type, bars side by side;
# the x axis is logarithmic, the y axis stays linear.
sns.displot(
    calls_duration,
    x="duration",
    hue="call_type",
    multiple="dodge",
    height=8,
    log_scale=(True, False),
)

# %% [markdown]
# ## Most frequent contacts by participant

# %%
# Overwrite the call table with the output of enumerate_contacts() and show
# its last rows. Later cells rely on the columns used below (contact_id, freq).
df_calls_inactive = enumerate_contacts(df_calls_inactive)
df_calls_inactive.tail()

# %%
# Keep only the rows whose contact_id is below 5.
# NOTE(review): presumably contact_id orders each participant's contacts by
# how often they occur, so this selects the most frequent ones — confirm in
# enumerate_contacts().
df_calls_frequent = df_calls_inactive.query("contact_id < 5")

# %%
# Box plot of the "freq" column for each of the retained contact ids.
sns.boxplot(x="contact_id", y="freq", data=df_calls_frequent)

# %% [markdown]
# # SMS data

# %%
# Load the SMS data for the same list of usernames as the calls above,
# compute the features with count_comms() and show their summary statistics.
df_sms_inactive = get_sms_data(participants_inactive_usernames)
df_sms_features = count_comms(df_sms_inactive)
df_sms_features.describe()

# %%
# Reshape the SMS-count columns from wide to long format: no_received and
# no_sent become a single "no" column, with the part after "no_"
# (received, sent) stored in "message_type".
sms_number = pd.wide_to_long(
    df_sms_features[["no_received", "no_sent"]].reset_index(),
    i="participant_id",
    j="message_type",
    stubnames="no",
    sep="_",
    # Raw string: "\D" is not a valid escape sequence in a normal string
    # literal and triggers a warning on recent Python versions.
    suffix=r"\D+",
)
# Distribution of the number of messages, one step histogram per message
# type, with bins of width 5.
sns.displot(
    sms_number, x="no", hue="message_type", binwidth=5, element="step", height=8
)

# %% [markdown]
# # Communication features

# %%
# Run enumerate_contacts() on the single-user example calls and show the result.
# NOTE(review): display() is not imported explicitly in this file; presumably
# it is the IPython/Jupyter builtin (or comes via the star import) — confirm
# before running this as a plain Python script.
df_calls_enumerated = enumerate_contacts(df_calls)
display(df_calls_enumerated)

# %%
# Compute contact_features() from the enumerated example calls.
df_calls_contact_features = contact_features(df_calls_enumerated)
display(df_calls_contact_features)

# %%
# Same two steps for the example SMS data.
df_sms_enumerated = enumerate_contacts(df_sms)
df_sms_contact_features = contact_features(df_sms_enumerated)
display(df_sms_contact_features)

# %%
# count_comms() output for the example calls.
display(count_comms(df_calls))

# %%
# count_comms() output for the example SMS data.
display(count_comms(df_sms))

# %%
# Output of calls_sms_features(), which takes both the call and the SMS data.
display(calls_sms_features(df_calls, df_sms))

# %%