Add SMS data exploration and use Jupytext to save Jupyter notebooks as .py scripts.

communication
junos 2021-05-07 12:10:46 +02:00
parent 056db73786
commit 10bdc8aa1d
7 changed files with 347 additions and 14 deletions

View File

@ -2,6 +2,6 @@
<project version="4"> <project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (straw2analysis)" project-jdk-type="Python SDK" /> <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (straw2analysis)" project-jdk-type="Python SDK" />
<component name="PyCharmProfessionalAdvertiser"> <component name="PyCharmProfessionalAdvertiser">
<option name="shown" value="false" /> <option name="shown" value="true" />
</component> </component>
</project> </project>

View File

@ -8,7 +8,9 @@ dependencies:
- isort - isort
- flake8 - flake8
- jupyterlab - jupyterlab
- jupytext
- mypy - mypy
- nodejs
- pandas - pandas
- psycopg2 - psycopg2
- python-dotenv - python-dotenv

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,126 @@
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.11.2
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---
# %%
import os
import sys

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# %%
# Project packages are imported as top-level modules, so the parent of the
# current working directory has to be on sys.path before the imports below.
# NOTE(review): this assumes the notebook is started from a direct
# subdirectory of the project root - confirm.
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

# %%
# Explicit imports instead of `import *`: every helper used in this notebook
# is named here, and `pd` no longer depends on features.communication
# happening to expose pandas under that name.
from features.communication import (
    count_comms,
    enumerate_contacts,
    get_call_data,
    get_sms_data,
)
# %% [markdown]
# # Example of communication data and feature calculation
# %%
# Load call data for one example username and print the raw frame.
df_calls = get_call_data(["nokia_0000003"])
print(df_calls)
# %%
# The cell output is whatever count_comms returns for that single participant.
count_comms(df_calls)
# %%
# Same two steps for SMS data: load for the example username, then count.
df_sms = get_sms_data(["nokia_0000003"])
count_comms(df_sms)
# %% [markdown]
# # Call data
# %%
# Importable only because the setup cell appended the parent of the working
# directory to sys.path.
import participants.query_db
# %%
# NOTE(review): the variable says "inactive", but get_usernames() is called
# without arguments here - confirm which participants it returns by default.
participants_inactive_usernames = participants.query_db.get_usernames()
df_calls_inactive = get_call_data(participants_inactive_usernames)
# %%
# Features for all loaded participants; head() is this cell's displayed output.
# Presumably indexed by participant_id, since later cells call reset_index()
# and then use "participant_id" as the id column - TODO confirm.
df_calls_features = count_comms(df_calls_inactive)
df_calls_features.head()
# %%
# Summary statistics of the feature columns.
df_calls_features.describe()
# %%
# Reshape the wide "no_<type>" columns into long format for plotting:
# one row per (participant_id, call_type) with the value in a column "no".
calls_number = pd.wide_to_long(
    df_calls_features.reset_index(),
    i="participant_id",
    j="call_type",
    stubnames="no",
    sep="_",
    # Raw string: "\D" in a plain literal is an invalid escape sequence.
    # The default suffix pattern only matches digits, so a non-digit
    # pattern is needed to pick up the textual column suffixes.
    suffix=r"\D+",
)

# %%
# Distribution of the "no" values, one step histogram per call type.
sns.displot(
    calls_number, x="no", hue="call_type", binwidth=5, element="step", height=8
)

# %%
# Same reshape for the "duration_<type>" columns.
calls_duration = pd.wide_to_long(
    df_calls_features.reset_index(),
    i="participant_id",
    j="call_type",
    stubnames="duration",
    sep="_",
    suffix=r"\D+",
)
# Log-scaled x axis (duration) with a linear y axis; bars for the call types
# are drawn side by side.
sns.displot(
    calls_duration,
    x="duration",
    hue="call_type",
    multiple="dodge",
    height=8,
    log_scale=(True, False),
)
# %% [markdown]
# ## Most frequent contacts by participant
# %%
# The cells below rely on the result of enumerate_contacts having "contact_id"
# and "freq" columns.
# NOTE: this overwrites df_calls_inactive, so re-running the cell feeds
# enumerate_contacts its own previous output.
df_calls_inactive = enumerate_contacts(df_calls_inactive)
df_calls_inactive.tail()
# %%
# Keep only rows whose contact_id is below 5.
# NOTE(review): presumably contact_id ranks contacts by frequency, so this
# keeps each participant's most frequent contacts - confirm.
df_calls_frequent = df_calls_inactive.query("contact_id < 5")
# %%
# Distribution of "freq" for each retained contact_id.
sns.boxplot(x="contact_id", y="freq", data=df_calls_frequent)
# %% [markdown]
# # SMS data

# %%
# Load SMS data for the same usernames used for calls and compute the
# count features; describe() is this cell's displayed output.
df_sms_inactive = get_sms_data(participants_inactive_usernames)
df_sms_features = count_comms(df_sms_inactive)
df_sms_features.describe()

# %%
# Reshape the wide "no_<type>" columns into long format for plotting:
# one row per (participant_id, message_type) with the value in a column "no".
sms_number = pd.wide_to_long(
    df_sms_features.reset_index(),
    i="participant_id",
    j="message_type",
    stubnames="no",
    sep="_",
    # Raw string: "\D" in a plain literal is an invalid escape sequence.
    suffix=r"\D+",
)
# Distribution of the "no" values, one step histogram per message type.
sns.displot(
    sms_number, x="no", hue="message_type", binwidth=5, element="step", height=8
)

View File

@ -231,6 +231,9 @@
} }
], ],
"metadata": { "metadata": {
"jupytext": {
"formats": "ipynb,auto:percent"
},
"kernelspec": { "kernelspec": {
"display_name": "straw2analysis", "display_name": "straw2analysis",
"language": "python", "language": "python",

View File

@ -0,0 +1,61 @@
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.11.2
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---
# %%
import os
import sys

from tabulate import tabulate

# Project packages are imported as top-level modules, so the parent of the
# current working directory has to be on sys.path before importing them.
# NOTE(review): this assumes the notebook is started from a direct
# subdirectory of the project root - confirm.
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

import participants.query_db

# %%
# Explicit imports instead of `import *`: these are the only two names this
# notebook takes from features.screen.
from features.screen import get_screen_data, screen_status
# %%
# Load screen data for one example username.
df_screen_nokia = get_screen_data(["nokia_0000003"])
# %%
print(df_screen_nokia)
# %%
# NOTE(review): the variable says "inactive", but get_usernames() is called
# without arguments here - confirm which participants it returns by default.
participants_inactive_usernames = participants.query_db.get_usernames()
df_screen_inactive = get_screen_data(participants_inactive_usernames)
# %%
# Replace the screen_status codes with readable labels: convert the column to
# a categorical, then rename its categories through the screen_status mapping.
df_screen_inactive["screen_status"] = (
df_screen_inactive["screen_status"]
.astype("category")
.cat.rename_categories(screen_status)
)
# Count rows per status label and render the counts as an HTML table, which
# is this cell's displayed output.
screen_freq = df_screen_inactive.value_counts("screen_status")
tabulate(screen_freq.to_frame(), tablefmt="html")
# %%
# Display the code -> label mapping that was used for the renaming above.
screen_status
# %% [markdown]
# A typical sequence might be: off -> locked -> on -> unlocked (0 -> 2 -> 1 -> 3)
# %%
# Difference between consecutive status codes of the single example
# participant, ordered by timestamp, then a tally of how often each
# difference occurs. Note that one difference value can stem from several
# transitions (0 -> 1 and 2 -> 3 both give 1).
status_diff = df_screen_nokia.sort_values("timestamp")["screen_status"].diff()
status_diff.value_counts().to_frame()
# %% [markdown]
# But I have also seen off -> on -> unlocked (with 2 - locked missing) and off -> locked -> on -> off -> locked (*again*).

View File

@ -2,7 +2,7 @@ from typing import List
import pandas as pd import pandas as pd
from config.models import Screen, Participant from config.models import Participant, Screen
from setup import db_engine, session from setup import db_engine, session
screen_status = {0: "off", 1: "on", 2: "locked", 3: "unlocked"} screen_status = {0: "off", 1: "on", 2: "locked", 3: "unlocked"}