Add SMS data exploration and use Jupytext to save Jupyter notebooks as .py scripts.

2021-05-07 12:10:46 +02:00 · 2021-05-07 12:10:46 +02:00 · 10bdc8aa1d
parent 056db73786
commit 10bdc8aa1d
7 changed files with 347 additions and 14 deletions
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -2,6 +2,6 @@
 <project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (straw2analysis)" project-jdk-type="Python SDK" />
  <component name="PyCharmProfessionalAdvertiser">
-    <option name="shown" value="false" />
+    <option name="shown" value="true" />
  </component>
 </project>
--- a/config/environment.yml
+++ b/config/environment.yml
@ -8,7 +8,9 @@ dependencies:
  - isort
  - flake8
  - jupyterlab
+  - jupytext
  - mypy
+  - nodejs
  - pandas
  - psycopg2
  - python-dotenv
--- a/exploration/communication.ipynb
+++ b/exploration/communication.ipynb
--- a/exploration/communication.py
+++ b/exploration/communication.py
@ -0,0 +1,126 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.11.2
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %%
+import os
+import sys
+
+import matplotlib.pyplot as plt
+
+# %%
+import seaborn as sns
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+
+# %%
+from features.communication import *
+
+# %% [markdown]
+# # Example of communication data and feature calculation
+
+# %%
+df_calls = get_call_data(["nokia_0000003"])
+print(df_calls)
+
+# %%
+count_comms(df_calls)
+
+# %%
+df_sms = get_sms_data(["nokia_0000003"])
+count_comms(df_sms)
+
+# %% [markdown]
+# # Call data
+
+# %%
+import participants.query_db
+
+# %%
+participants_inactive_usernames = participants.query_db.get_usernames()
+df_calls_inactive = get_call_data(participants_inactive_usernames)
+
+# %%
+df_calls_features = count_comms(df_calls_inactive)
+df_calls_features.head()
+
+# %%
+df_calls_features.describe()
+
+# %%
+calls_number = pd.wide_to_long(
+    df_calls_features.reset_index(),
+    i="participant_id",
+    j="call_type",
+    stubnames="no",
+    sep="_",
+    suffix=r"\D+",  # raw string: "\D" is a regex class, not a str escape
+)
+
+# %%
+sns.displot(calls_number, x="no", hue="call_type", binwidth=5, element="step", height=8)
+
+# %%
+calls_duration = pd.wide_to_long(
+    df_calls_features.reset_index(),
+    i="participant_id",
+    j="call_type",
+    stubnames="duration",
+    sep="_",
+    suffix=r"\D+",  # raw string: "\D" is a regex class, not a str escape
+)
+sns.displot(
+    calls_duration,
+    x="duration",
+    hue="call_type",
+    multiple="dodge",
+    height=8,
+    log_scale=(True, False),
+)
+
+# %% [markdown]
+# ## Most frequent contacts by participant
+
+# %%
+df_calls_inactive = enumerate_contacts(df_calls_inactive)
+df_calls_inactive.tail()
+
+# %%
+df_calls_frequent = df_calls_inactive.query("contact_id < 5")
+
+# %%
+sns.boxplot(x="contact_id", y="freq", data=df_calls_frequent)
+
+# %% [markdown]
+# # SMS data
+
+# %%
+df_sms_inactive = get_sms_data(participants_inactive_usernames)
+df_sms_features = count_comms(df_sms_inactive)
+df_sms_features.describe()
+
+# %%
+sms_number = pd.wide_to_long(
+    df_sms_features.reset_index(),
+    i="participant_id",
+    j="message_type",
+    stubnames="no",
+    sep="_",
+    suffix=r"\D+",  # raw string: "\D" is a regex class, not a str escape
+)
+sns.displot(
+    sms_number, x="no", hue="message_type", binwidth=5, element="step", height=8
+)
--- a/exploration/screen.ipynb
+++ b/exploration/screen.ipynb
@ -231,6 +231,9 @@
  }
 ],
 "metadata": {
+  "jupytext": {
+   "formats": "ipynb,auto:percent"
+  },
  "kernelspec": {
   "display_name": "straw2analysis",
   "language": "python",
--- a/exploration/screen.py
+++ b/exploration/screen.py
@ -0,0 +1,61 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.11.2
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %%
+import os
+import sys
+
+from tabulate import tabulate
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+
+import participants.query_db
+
+# %%
+from features.screen import *
+
+# %%
+df_screen_nokia = get_screen_data(["nokia_0000003"])
+
+# %%
+print(df_screen_nokia)
+
+# %%
+participants_inactive_usernames = participants.query_db.get_usernames()
+df_screen_inactive = get_screen_data(participants_inactive_usernames)
+
+# %%
+df_screen_inactive["screen_status"] = (
+    df_screen_inactive["screen_status"]
+    .astype("category")
+    .cat.rename_categories(screen_status)
+)
+screen_freq = df_screen_inactive.value_counts("screen_status")
+tabulate(screen_freq.to_frame(), tablefmt="html")
+
+# %%
+screen_status
+
+# %% [markdown]
+# A typical sequence might be: off -> locked -> on -> unlocked (0 -> 2 -> 1 -> 3)
+
+# %%
+status_diff = df_screen_nokia.sort_values("timestamp")["screen_status"].diff()
+status_diff.value_counts().to_frame()
+
+# %% [markdown]
+# But I have also seen off -> on -> unlocked (with 2 - locked missing) and off -> locked -> on -> off -> locked (*again*).
--- a/features/screen.py
+++ b/features/screen.py
@ -2,7 +2,7 @@ from typing import List

 import pandas as pd

-from config.models import Screen, Participant
+from config.models import Participant, Screen
 from setup import db_engine, session

 screen_status = {0: "off", 1: "on", 2: "locked", 3: "unlocked"}