stress_at_work_analysis/exploration/expl_communication.py

178 lines
3.5 KiB
Python
Raw Permalink Normal View History

# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.11.4
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---
# %%
import importlib

# %%
# %matplotlib inline
import os
import sys

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Append the parent of the current working directory to the import path,
# skipping the append when it is already listed.
# NOTE(review): presumably this makes the project-level packages imported
# below (`features`, `participants`) resolvable from the notebook's
# directory - confirm.
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
# %%
from features import communication, helper
# %%
# Reload the already-imported `communication` module; importlib.reload
# re-executes the module's code, so edits made to it are picked up in
# the running session.
importlib.reload(communication)
# %% [markdown]
# # Example of communication data and feature calculation
# %%
# Fetch the call records of one example participant and print them.
df_calls = communication.get_call_data(["nokia_0000003"])
print(df_calls)
# %%
# Run the calls through helper.get_date_from_timestamp, then count the
# communications, passing ["date_lj"] as the second argument.
# NOTE(review): presumably the helper adds the "date_lj" column and the second
# argument is the list of grouping columns - confirm against `features`.
df_calls = helper.get_date_from_timestamp(df_calls)
communication.count_comms(df_calls, ["date_lj"])
# %%
# Same steps for the SMS records of the same participant.
df_sms = communication.get_sms_data(["nokia_0000003"])
df_sms = helper.get_date_from_timestamp(df_sms)
communication.count_comms(df_sms, ["date_lj"])
# %%
# Features computed from the call and SMS frames together, again with
# ["date_lj"] as the last argument.
communication.calls_sms_features(df_calls, df_sms, ["date_lj"])
# %% [markdown]
# # Call data
# %%
import participants.query_db
# %%
# Load the call records for every username returned by the participants
# database query.
# The helpers are reached through the `communication` module because that is
# the only name imported from `features`; calling them bare raises NameError.
participants_inactive_usernames = participants.query_db.get_usernames()
df_calls_inactive = communication.get_call_data(participants_inactive_usernames)
# %%
participants_inactive_usernames
# %%
df_calls_inactive.head()
# %%
# NOTE(review): assumes `enumerate_contacts` is defined in
# features.communication like the other helpers used here - confirm.
communication.enumerate_contacts(df_calls_inactive).head()
# %%
# Count communications with only the data frame as argument (no second
# argument, unlike the example cells earlier in the notebook).
df_calls_features = communication.count_comms(df_calls_inactive)
df_calls_features.head()
# %%
df_calls_features.describe()
# %%
# Reshape the call-count columns from wide to long: the "no_<type>" columns
# become a single "no" column, with the part after "no_" stored in "call_type".
# The suffix pattern is a raw string so that "\D" (any non-digit) reaches the
# regex engine without relying on Python passing an invalid escape through.
calls_number = pd.wide_to_long(
    df_calls_features[["no_incoming", "no_outgoing", "no_missed"]].reset_index(),
    i="participant_id",
    j="call_type",
    stubnames="no",
    sep="_",
    suffix=r"\D+",
)
# %%
calls_number
# %%
# Distribution of the counts, one step histogram per call type.
sns.displot(calls_number, x="no", hue="call_type", binwidth=5, element="step", height=8)
# %%
# Reshape the total-duration columns from wide to long: the "duration_<rest>"
# columns become a single "duration" column, with the part after "duration_"
# (e.g. "total_incoming") stored in "call_type".
# The suffix pattern is a raw string so that "\D" (any non-digit) is not an
# invalid Python escape sequence.
calls_duration = pd.wide_to_long(
    df_calls_features[
        ["duration_total_incoming", "duration_total_outgoing"]
    ].reset_index(),
    i="participant_id",
    j="call_type",
    stubnames="duration",
    sep="_",
    suffix=r"\D+",
)
# Distribution of the durations, with bars for each call type side by side
# and a logarithmic x axis (the y axis stays linear).
sns.displot(
    calls_duration,
    x="duration",
    hue="call_type",
    multiple="dodge",
    height=8,
    log_scale=(True, False),
)
# %% [markdown]
# ## Most frequent contacts by participant
# %%
# Replace the call frame with its contact-enumerated version.
# The helper is reached through the `communication` module, the only name
# imported from `features`; the bare name raises NameError.
# NOTE(review): assumes `enumerate_contacts` lives in features.communication
# and adds the "contact_id" and "freq" columns used below - confirm.
df_calls_inactive = communication.enumerate_contacts(df_calls_inactive)
df_calls_inactive.tail()
# %%
# Keep only rows whose contact_id is below 5.
df_calls_frequent = df_calls_inactive.query("contact_id < 5")
# %%
sns.boxplot(x="contact_id", y="freq", data=df_calls_frequent)
# %% [markdown]
# # SMS data
# %%
# Load the SMS records for the same list of usernames and count the
# communications.
# Both helpers are reached through the `communication` module, the only name
# imported from `features`; the bare names raise NameError.
df_sms_inactive = communication.get_sms_data(participants_inactive_usernames)
df_sms_features = communication.count_comms(df_sms_inactive)
df_sms_features.describe()
# %%
# Reshape the SMS-count columns from wide to long: "no_received" and "no_sent"
# become a single "no" column, with the part after "no_" stored in
# "message_type".
# The suffix pattern is a raw string so that "\D" (any non-digit) is not an
# invalid Python escape sequence.
sms_number = pd.wide_to_long(
    df_sms_features[["no_received", "no_sent"]].reset_index(),
    i="participant_id",
    j="message_type",
    stubnames="no",
    sep="_",
    suffix=r"\D+",
)
# Distribution of the counts, one step histogram per message type.
sns.displot(
    sms_number, x="no", hue="message_type", binwidth=5, element="step", height=8
)
# %% [markdown]
# # Communication features
# %%
# Contact-level features for the example participant's calls and SMS
# (`df_calls` and `df_sms` come from the example cells near the top).
# Every helper is reached through the `communication` module, the only name
# imported from `features`; the bare names raise NameError.
# NOTE(review): assumes `enumerate_contacts` and `contact_features` are
# defined in features.communication - confirm.
# `display` is not imported here; it is provided by the IPython/Jupyter session.
df_calls_enumerated = communication.enumerate_contacts(df_calls)
display(df_calls_enumerated)
# %%
df_calls_contact_features = communication.contact_features(df_calls_enumerated)
display(df_calls_contact_features)
# %%
df_sms_enumerated = communication.enumerate_contacts(df_sms)
df_sms_contact_features = communication.contact_features(df_sms_enumerated)
display(df_sms_contact_features)
# %%
# The same feature functions as in the example cells, here called without
# the ["date_lj"] argument.
display(communication.count_comms(df_calls))
# %%
display(communication.count_comms(df_sms))
# %%
display(communication.calls_sms_features(df_calls, df_sms))
# %%