Add SMS data exploration and use Jupytext to save Jupyter notebooks as .py scripts.

2021-05-07 12:10:46 +02:00 · 2021-05-07 12:10:46 +02:00 · 10bdc8aa1d
parent 056db73786
commit 10bdc8aa1d
7 changed files with 347 additions and 14 deletions
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -2,6 +2,6 @@
 <project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (straw2analysis)" project-jdk-type="Python SDK" />
  <component name="PyCharmProfessionalAdvertiser">
-    <option name="shown" value="false" />
+    <option name="shown" value="true" />
  </component>
 </project>
--- a/config/environment.yml
+++ b/config/environment.yml
@ -8,7 +8,9 @@ dependencies:
  - isort
  - flake8
  - jupyterlab
  - jupytext
  - mypy
  - nodejs
  - pandas
  - psycopg2
  - python-dotenv
--- a/exploration/communication.ipynb
+++ b/exploration/communication.ipynb
--- a/exploration/communication.py
+++ b/exploration/communication.py
@ -0,0 +1,126 @@
 # ---
 # jupyter:
 #   jupytext:
 #     formats: ipynb,py:percent
 #     text_representation:
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
 #       jupytext_version: 1.11.2
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
 #     name: straw2analysis
 # ---
 # %%
 import os
 import sys
 import matplotlib.pyplot as plt
 # %%
 import seaborn as sns
 # Put the parent of the current working directory on the import path so the
 # project packages imported below can be found.
 # NOTE(review): assumes the notebook is run from a direct subfolder of the
 # project root - confirm.
 nb_dir = os.path.dirname(os.getcwd())
 if nb_dir not in sys.path:
    sys.path.append(nb_dir)
 # %%
 from features.communication import *
 # %% [markdown]
 # # Example of communication data and feature calculation
 # %%
 # Load the call records of one example participant and print the raw frame.
 # NOTE(review): the list presumably holds participant usernames - confirm
 # against get_call_data.
 df_calls = get_call_data(["nokia_0000003"])
 print(df_calls)
 # %%
 # Feature calculation on the example calls; the result is the cell's output.
 count_comms(df_calls)
 # %%
 # Same example participant, this time for SMS data.
 df_sms = get_sms_data(["nokia_0000003"])
 count_comms(df_sms)
 # %% [markdown]
 # # Call data
 # %%
 import participants.query_db
 # %%
 # Fetch the usernames from the participants database helper and load their
 # call data in one go.
 # NOTE(review): the variable name suggests these are inactive participants;
 # what get_usernames() actually filters on is not visible here - confirm.
 participants_inactive_usernames = participants.query_db.get_usernames()
 df_calls_inactive = get_call_data(participants_inactive_usernames)
 # %%
 # Compute the call features and preview the first rows.
 df_calls_features = count_comms(df_calls_inactive)
 df_calls_features.head()
 # %%
 # Summary statistics of the call features.
 df_calls_features.describe()
 # %%
 # Reshape the call counts from wide to long format: every column named
 # "no_<suffix>" is stacked into a single "no" column, with the non-digit
 # suffix stored in the new "call_type" index level next to "participant_id".
 # The suffix pattern is a raw string so that the regex escape \D is not
 # parsed as an (invalid) string escape sequence.
 # NOTE(review): `pd` is not imported in this script; presumably it is
 # provided by the star import from features.communication - confirm.
 calls_number = pd.wide_to_long(
    df_calls_features.reset_index(),
    i="participant_id",
    j="call_type",
    stubnames="no",
    sep="_",
    suffix=r"\D+",
 )
 # %%
 # Distribution of the number of calls, coloured by call type.
 sns.displot(calls_number, x="no", hue="call_type", binwidth=5, element="step", height=8)
 # %%
 # Reshape the call durations from wide to long format: every column named
 # "duration_<suffix>" is stacked into a single "duration" column, with the
 # non-digit suffix stored in the new "call_type" index level.
 # The suffix pattern is a raw string so that the regex escape \D is not
 # parsed as an (invalid) string escape sequence.
 # NOTE(review): `pd` is not imported in this script; presumably it is
 # provided by the star import from features.communication - confirm.
 calls_duration = pd.wide_to_long(
    df_calls_features.reset_index(),
    i="participant_id",
    j="call_type",
    stubnames="duration",
    sep="_",
    suffix=r"\D+",
 )
 # Distribution of call durations per call type, with bars placed side by
 # side; only the x axis (duration) is log-scaled.
 sns.displot(
    calls_duration,
    x="duration",
    hue="call_type",
    multiple="dodge",
    height=8,
    log_scale=(True, False),
 )
 # %% [markdown]
 # ## Most frequent contacts by participant
 # %%
 # Replace the call frame with the output of enumerate_contacts and preview
 # its last rows.
 # NOTE(review): enumerate_contacts presumably adds the "contact_id" and
 # "freq" columns used below - confirm in features.communication.
 df_calls_inactive = enumerate_contacts(df_calls_inactive)
 df_calls_inactive.tail()
 # %%
 # Keep only the rows whose contact_id is below 5.
 df_calls_frequent = df_calls_inactive.query("contact_id < 5")
 # %%
 # Spread of "freq" for each of the retained contact ids.
 sns.boxplot(x="contact_id", y="freq", data=df_calls_frequent)
 # %% [markdown]
 # # SMS data
 # %%
 # Load the SMS data for the same usernames as the call data above, compute
 # the SMS features and show their summary statistics.
 df_sms_inactive = get_sms_data(participants_inactive_usernames)
 df_sms_features = count_comms(df_sms_inactive)
 df_sms_features.describe()
 # %%
 # Reshape the message counts from wide to long format: every column named
 # "no_<suffix>" is stacked into a single "no" column, with the non-digit
 # suffix stored in the new "message_type" index level.
 # The suffix pattern is a raw string so that the regex escape \D is not
 # parsed as an (invalid) string escape sequence.
 # NOTE(review): `pd` is not imported in this script; presumably it is
 # provided by the star import from features.communication - confirm.
 sms_number = pd.wide_to_long(
    df_sms_features.reset_index(),
    i="participant_id",
    j="message_type",
    stubnames="no",
    sep="_",
    suffix=r"\D+",
 )
 # Distribution of the number of messages, coloured by message type.
 sns.displot(
    sms_number, x="no", hue="message_type", binwidth=5, element="step", height=8
 )
--- a/exploration/screen.ipynb
+++ b/exploration/screen.ipynb
@ -231,6 +231,9 @@
  }
 ],
 "metadata": {
  "jupytext": {
   "formats": "ipynb,auto:percent"
  },
  "kernelspec": {
   "display_name": "straw2analysis",
   "language": "python",
--- a/exploration/screen.py
+++ b/exploration/screen.py
@ -0,0 +1,61 @@
 # ---
 # jupyter:
 #   jupytext:
 #     formats: ipynb,py:percent
 #     text_representation:
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
 #       jupytext_version: 1.11.2
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
 #     name: straw2analysis
 # ---
 # %%
 import os
 import sys
 from tabulate import tabulate
 # Put the parent of the current working directory on the import path so the
 # project packages imported below can be found.
 # NOTE(review): assumes the notebook is run from a direct subfolder of the
 # project root - confirm.
 nb_dir = os.path.dirname(os.getcwd())
 if nb_dir not in sys.path:
    sys.path.append(nb_dir)
 import participants.query_db
 # %%
 from features.screen import *
 # %%
 # Load the screen events of one example participant.
 # NOTE(review): the list presumably holds participant usernames - confirm
 # against get_screen_data.
 df_screen_nokia = get_screen_data(["nokia_0000003"])
 # %%
 print(df_screen_nokia)
 # %%
 # Fetch the usernames from the participants database helper and load their
 # screen data in one go.
 # NOTE(review): the variable name suggests these are inactive participants;
 # what get_usernames() actually filters on is not visible here - confirm.
 participants_inactive_usernames = participants.query_db.get_usernames()
 df_screen_inactive = get_screen_data(participants_inactive_usernames)
 # %%
 # Convert the status column to a categorical and rename its categories with
 # the screen_status mapping, so the counts below are labelled by name.
 df_screen_inactive["screen_status"] = (
    df_screen_inactive["screen_status"]
    .astype("category")
    .cat.rename_categories(screen_status)
 )
 # Count the rows per screen status and render the counts as an HTML table.
 screen_freq = df_screen_inactive.value_counts("screen_status")
 tabulate(screen_freq.to_frame(), tablefmt="html")
 # %%
 # Show the status mapping itself for reference.
 screen_status
 # %% [markdown]
 # A typical sequence might be: off -> locked -> on -> unlocked (0 -> 2 -> 1 -> 3)
 # %%
 # Order the example participant's events by timestamp and take the
 # difference between each status value and the previous one, then count how
 # often each difference occurs.
 # NOTE(review): relies on screen_status in df_screen_nokia still being
 # numeric; only df_screen_inactive was converted to labels above.
 status_diff = df_screen_nokia.sort_values("timestamp")["screen_status"].diff()
 status_diff.value_counts().to_frame()
 # %% [markdown]
 # However, I have also seen off -> on -> unlocked (with state 2, locked, missing) and off -> locked -> on -> off -> locked (i.e. locked *again*).
--- a/features/screen.py
+++ b/features/screen.py
@ -2,7 +2,7 @@ from typing import List
 import pandas as pd
-from config.models import Screen, Participant
+from config.models import Participant, Screen
 from setup import db_engine, session
 # Human-readable labels for the numeric screen status codes.
 screen_status = {0: "off", 1: "on", 2: "locked", 3: "unlocked"}