Add a method to enumerate contacts and a method to test this with call data.

communication
junos 2021-02-02 16:20:46 +01:00
parent 0bc66ce4b9
commit 29b664c41b
2 changed files with 37 additions and 8 deletions

View File

@ -18,5 +18,14 @@ def get_call_data(usernames: List) -> pd.DataFrame:
def enumerate_contacts(comm_df: pd.DataFrame) -> pd.DataFrame:
    """Count contacts (callers, senders) and enumerate them by their frequency.

    Every distinct value of the "trace" column is assigned an integer rank:
    0 for the most frequent trace, 1 for the next one, and so on. The ranks
    are stored in a new categorical column "contact_id".

    Parameters
    ----------
    comm_df : pd.DataFrame
        Communication data containing a "trace" column that identifies the
        contact of each row.

    Returns
    -------
    pd.DataFrame
        The same object that was passed in; the "contact_id" column is added
        (or overwritten) in place.
    """
    # Distinct traces ordered from the most to the least frequent one.
    # The order among equally frequent traces is whatever value_counts yields.
    traces_by_frequency = comm_df["trace"].value_counts(sort=True, ascending=False).index
    # Dictionary translating each trace into its frequency rank.
    contact_code = {trace: rank for rank, trace in enumerate(traces_by_frequency)}
    # Use categorical data instead of a plain character column, then recode the
    # categories into integers so that 0 is the contact that occurs most often.
    comm_df["contact_id"] = (
        comm_df["trace"].astype("category").cat.rename_categories(contact_code)
    )
    return comm_df

View File

@ -3,6 +3,7 @@ import unittest
import numpy as np
import pandas as pd
from numpy.random import default_rng
from pandas.testing import assert_series_equal
from features.communication import enumerate_contacts, get_call_data
@ -10,15 +11,17 @@ rng = default_rng()
class CallsFeatures(unittest.TestCase):
def setUp(self):
@classmethod
def setUpClass(cls) -> None:
call_rows = 10
callers = np.concatenate((
np.repeat("caller1", 4),
np.repeat("caller1", 2),
np.repeat("caller2", 3),
np.repeat("caller3", 2),
np.repeat("caller3", 4),
np.repeat("caller4", 1)), axis=None)
rng.shuffle(callers)
self.calls = pd.DataFrame({
cls.calls = pd.DataFrame({
"id": np.linspace(0, call_rows - 1, num=call_rows, dtype="u4") + 100,
"_id": np.linspace(0, call_rows - 1, num=call_rows, dtype="u4"),
"timestamp": np.sort(rng.integers(1612169903000, 1614556703000, size=call_rows)),
@ -28,12 +31,29 @@ class CallsFeatures(unittest.TestCase):
"trace": callers,
"participant_id": 29
})
print(self.calls)
@classmethod
def assertSeriesEqual(cls, a, b, msg=None, **optional):
try:
assert_series_equal(a, b, **optional)
except AssertionError as e:
raise cls.failureException(msg) from e
def setUp(self):
self.addTypeEqualityFunc(pd.DataFrame, self.assertSeriesEqual)
def test_get_calls_data(self):
    """Check that get_call_data returns a non-None result for one username."""
    # NOTE(review): presumably this needs access to the real data source — confirm.
    self.assertIsNotNone(get_call_data(["nokia_0000003"]))
def test_enumeration(self):
    """Compare enumerate_contacts against a manually enumerated contact column."""
    # Work on a copy so that the class-level fixture shared by all tests is
    # not modified by the columns added below.
    calls = self.calls.copy()
    # Enumerate callers manually by their frequency as set in setUpClass.
    calls["contact_id_manual"] = calls["trace"].astype("category").cat.rename_categories(
        {
            "caller1": 2,
            "caller2": 1,
            "caller3": 0,
            "caller4": 3,
        }
    )
    calls = enumerate_contacts(calls)
    self.assertSeriesEqual(calls["contact_id_manual"], calls["contact_id"], check_names=False)