169 lines
3.2 KiB
Python
169 lines
3.2 KiB
Python
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.11.4
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---
|
|
|
|
# %%
|
|
# %matplotlib inline
|
|
import os
|
|
import sys
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
# %%
|
|
import seaborn as sns
|
|
|
|
# Put the parent of the current working directory on the import path.
# Presumably this is the repository root, so that the project packages
# imported further down (`features`, `participants`) resolve — confirm.
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    # Guarded so that re-running the cell does not add duplicate entries.
    sys.path.append(nb_dir)
|
|
|
|
# %%
|
|
from features.communication import *
|
|
|
|
# %% [markdown]
|
|
# # Example of communication data and feature calculation
|
|
|
|
# %%
|
|
# Load the call log of one example participant and print what comes back.
example_username = "nokia_0000003"
df_calls = get_call_data([example_username])
print(df_calls)

# %%
count_comms(df_calls)

# %%
# Same participant, SMS log this time; the cell output is the count_comms
# result for the messages.
df_sms = get_sms_data([example_username])
count_comms(df_sms)
|
|
|
|
# %% [markdown]
|
|
# # Call data
|
|
|
|
# %%
|
|
import participants.query_db
|
|
|
|
# %%
|
|
# Collect usernames via participants.query_db and load their call logs.
# NOTE(review): the variable names say "inactive", but get_usernames() is
# called without arguments here — confirm which participants it returns.
participants_inactive_usernames = participants.query_db.get_usernames()
df_calls_inactive = get_call_data(participants_inactive_usernames)

# %%
participants_inactive_usernames

# %%
df_calls_inactive.head()

# %%
# Preview only: the result of enumerate_contacts is not stored in this cell.
enumerate_contacts(df_calls_inactive).head()

# %%
# Per-participant communication counts, kept for the reshaping cells below.
df_calls_features = count_comms(df_calls_inactive)
df_calls_features.head()

# %%
df_calls_features.describe()
|
|
|
|
# %%
|
|
# Reshape the call counts from wide to long format: the "no_incoming",
# "no_outgoing" and "no_missed" columns are stacked into a single "no"
# column, and the part of each column name after "no_" becomes the new
# "call_type" index level alongside "participant_id".
# NOTE(review): `pd` is not imported explicitly in the visible part of this
# notebook; presumably it arrives through `from features.communication
# import *` — confirm, or add `import pandas as pd`.
calls_number = pd.wide_to_long(
    df_calls_features[["no_incoming", "no_outgoing", "no_missed"]].reset_index(),
    i="participant_id",
    j="call_type",
    stubnames="no",
    sep="_",
    # Raw string: "\D" is a regex class (any non-digit), not a Python string
    # escape. Without the r-prefix newer interpreters warn about it.
    suffix=r"\D+",
)
|
|
|
|
# %%
|
|
calls_number

# %%
# Step-outline histogram of the call counts, coloured by call type,
# with bins of width 5.
sns.displot(
    calls_number,
    x="no",
    hue="call_type",
    binwidth=5,
    element="step",
    height=8,
)
|
|
|
|
# %%
|
|
# Reshape the total call durations from wide to long format: the
# "duration_total_incoming" and "duration_total_outgoing" columns are stacked
# into a single "duration" column, and the rest of each column name after
# "duration_" becomes the new "call_type" index level.
# NOTE(review): `pd` is not imported explicitly in the visible part of this
# notebook; presumably it arrives through `from features.communication
# import *` — confirm, or add `import pandas as pd`.
calls_duration = pd.wide_to_long(
    df_calls_features[
        ["duration_total_incoming", "duration_total_outgoing"]
    ].reset_index(),
    i="participant_id",
    j="call_type",
    stubnames="duration",
    sep="_",
    # Raw string: "\D" is a regex class (any non-digit), not a Python string
    # escape. Without the r-prefix newer interpreters warn about it.
    suffix=r"\D+",
)
# Histogram of the durations with the call types drawn side by side
# ("dodge"); only the x axis is log-scaled.
sns.displot(
    calls_duration,
    x="duration",
    hue="call_type",
    multiple="dodge",
    height=8,
    log_scale=(True, False),
)
|
|
|
|
# %% [markdown]
|
|
# ## Most frequent contacts by participant
|
|
|
|
# %%
|
|
# Replace the call table with its enumerate_contacts() result; the "contact_id"
# and "freq" columns used below are expected to come from that call
# (TODO confirm against features.communication).
df_calls_inactive = enumerate_contacts(df_calls_inactive)
df_calls_inactive.tail()

# %%
# Keep only rows whose contact_id is below 5.
df_calls_frequent = df_calls_inactive.query("contact_id < 5")

# %%
# Distribution of "freq" for each of the retained contact ids.
sns.boxplot(
    x="contact_id",
    y="freq",
    data=df_calls_frequent,
)
|
|
|
|
# %% [markdown]
|
|
# # SMS data
|
|
|
|
# %%
|
|
# Same pipeline as for calls, applied to the SMS logs of the same usernames:
# load the data, compute the count_comms features, summarise them.
df_sms_inactive = get_sms_data(participants_inactive_usernames)
df_sms_features = count_comms(df_sms_inactive)
df_sms_features.describe()
|
|
|
|
# %%
|
|
# Reshape the SMS counts from wide to long format: the "no_received" and
# "no_sent" columns are stacked into a single "no" column, and the part of
# each column name after "no_" becomes the new "message_type" index level.
# NOTE(review): `pd` is not imported explicitly in the visible part of this
# notebook; presumably it arrives through `from features.communication
# import *` — confirm, or add `import pandas as pd`.
sms_number = pd.wide_to_long(
    df_sms_features[["no_received", "no_sent"]].reset_index(),
    i="participant_id",
    j="message_type",
    stubnames="no",
    sep="_",
    # Raw string: "\D" is a regex class (any non-digit), not a Python string
    # escape. Without the r-prefix newer interpreters warn about it.
    suffix=r"\D+",
)
# Step-outline histogram of the message counts, coloured by message type,
# with bins of width 5.
sns.displot(
    sms_number, x="no", hue="message_type", binwidth=5, element="step", height=8
)
|
|
|
|
# %% [markdown]
|
|
# # Communication features
|
|
|
|
# %%
|
|
# Walk through the feature functions on the single-participant example data
# (df_calls / df_sms), displaying each intermediate result.
df_calls_enumerated = enumerate_contacts(df_calls)
display(df_calls_enumerated)

# %%
df_calls_contact_features = contact_features(df_calls_enumerated)
display(df_calls_contact_features)

# %%
# Same two steps for the SMS data, showing only the final features.
df_sms_enumerated = enumerate_contacts(df_sms)
df_sms_contact_features = contact_features(df_sms_enumerated)
display(df_sms_contact_features)

# %%
display(count_comms(df_calls))

# %%
display(count_comms(df_sms))

# %%
# Features computed from the call and SMS data together.
display(calls_sms_features(df_calls, df_sms))
|
|
|
|
# %%
|