From 29b664c41b56265615d1935302442a0c0647fb60 Mon Sep 17 00:00:00 2001
From: junos
Date: Tue, 2 Feb 2021 16:20:46 +0100
Subject: [PATCH] Add a method to enumerate contacts and a method to test this with call data.

---
 features/communication.py  | 11 ++++++++++-
 test/test_communication.py | 34 +++++++++++++++++++++++++++-------
 2 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/features/communication.py b/features/communication.py
index 73f9b7d..f2453df 100644
--- a/features/communication.py
+++ b/features/communication.py
@@ -18,5 +18,14 @@ def get_call_data(usernames: List) -> pd.DataFrame:
 
 
 def enumerate_contacts(comm_df: pd.DataFrame) -> pd.DataFrame:
-    # Calculate frequencies and return in descending order.
+    """Count contacts (callers, senders) and enumerate them by their frequency."""
+    contact_counts = comm_df["trace"].value_counts(sort=True, ascending=False).to_frame(name="frequency")
+    # A frequency table of different traces (contacts).
+    contact_counts["contact_id"] = list(range(len(contact_counts.index)))
+    contact_code = contact_counts["contact_id"].to_dict()
+    # Create a dictionary translating traces into integers, enumerated by their frequency.
+    comm_df["contact_id"] = comm_df["trace"].astype("category")
+    # Transform to categorical data instead of a simple character column.
+    comm_df["contact_id"] = comm_df["contact_id"].cat.rename_categories(contact_code)
+    # Recode the contacts into integers from 0 to n_contacts, so that the first one is contacted the most often.
     return comm_df
diff --git a/test/test_communication.py b/test/test_communication.py
index 12263fd..0fb69ab 100644
--- a/test/test_communication.py
+++ b/test/test_communication.py
@@ -3,6 +3,7 @@ import unittest
 import numpy as np
 import pandas as pd
 from numpy.random import default_rng
+from pandas.testing import assert_series_equal
 
 from features.communication import enumerate_contacts, get_call_data
 
@@ -10,15 +11,17 @@ rng = default_rng()
 
 
 class CallsFeatures(unittest.TestCase):
-    def setUp(self):
+
+    @classmethod
+    def setUpClass(cls) -> None:
         call_rows = 10
         callers = np.concatenate((
-            np.repeat("caller1", 4),
+            np.repeat("caller1", 2),
             np.repeat("caller2", 3),
-            np.repeat("caller3", 2),
+            np.repeat("caller3", 4),
             np.repeat("caller4", 1)), axis=None)
         rng.shuffle(callers)
-        self.calls = pd.DataFrame({
+        cls.calls = pd.DataFrame({
             "id": np.linspace(0, call_rows - 1, num=call_rows, dtype="u4") + 100,
             "_id": np.linspace(0, call_rows - 1, num=call_rows, dtype="u4"),
             "timestamp": np.sort(rng.integers(1612169903000, 1614556703000, size=call_rows)),
@@ -28,12 +31,29 @@ class CallsFeatures(unittest.TestCase):
             "trace": callers,
             "participant_id": 29
         })
-        print(self.calls)
+
+    @classmethod
+    def assertSeriesEqual(cls, a, b, msg=None, **optional):
+        try:
+            assert_series_equal(a, b, **optional)
+        except AssertionError as e:
+            raise cls.failureException(msg) from e
+
+    def setUp(self):
+        self.addTypeEqualityFunc(pd.Series, self.assertSeriesEqual)
 
     def test_get_calls_data(self):
         calls_from_db = get_call_data(["nokia_0000003"])
         self.assertIsNotNone(calls_from_db)
 
     def test_enumeration(self):
-        enumerate_contacts(self.calls)
-        #Enumerate manually and compare
+        self.calls["contact_id_manual"] = self.calls["trace"].astype("category")
+        self.calls["contact_id_manual"] = self.calls["contact_id_manual"].cat.rename_categories(
+            {"caller1": 2,
+             "caller2": 1,
+             "caller3": 0,
+             "caller4": 3}
+        )
+        # Enumerate callers manually by their frequency as set in setUpClass.
+        self.calls = enumerate_contacts(self.calls)
+        self.assertSeriesEqual(self.calls["contact_id_manual"], self.calls["contact_id"], check_names=False)