61 lines
1.9 KiB
Python
61 lines
1.9 KiB
Python
from typing import List
|
|
|
|
import pandas as pd
|
|
|
|
from config.models import Call, Participant
|
|
from setup import db_engine, session
|
|
|
|
|
|
def get_call_data(usernames: List) -> pd.DataFrame:
    """
    Fetch call records for the given participants as a dataframe.

    The query selects Call together with Participant.username, matches
    calls to participants on Participant.id == Call.participant_id, and
    keeps only rows whose username is contained in `usernames`.

    Parameters
    ----------
    usernames: List
        Usernames used to restrict the result (SQL IN condition).

    Returns
    -------
    df_calls: pd.DataFrame
        The rows returned by the query, as read by pandas.read_sql.
    """
    # Pair every call with the username of the participant it belongs to.
    calls_with_username = session.query(Call, Participant.username).filter(
        Participant.id == Call.participant_id
    )
    # Restrict the result to the requested participants.
    selected_calls = calls_with_username.filter(
        Participant.username.in_(usernames)
    )
    # The connection is only needed while pandas executes the statement.
    with db_engine.connect() as connection:
        return pd.read_sql(selected_calls.statement, connection)
|
|
|
|
|
|
def enumerate_contacts(comm_df: pd.DataFrame) -> pd.DataFrame:
    """
    Label every contact (trace) with an integer reflecting how often it occurs.

    The input dataframe is modified in place: a categorical column
    contact_id is added (or overwritten) and the same object is returned.

    Parameters
    ----------
    comm_df: pd.DataFrame
        A dataframe of calls or SMSes; it must contain the column "trace".

    Returns
    -------
    comm_df: pd.DataFrame
        The same dataframe with the column contact_id, where 0 denotes the
        most frequent trace, 1 the second most frequent, and so on.
    """
    # Distinct traces ordered from the most to the least frequent.
    traces_by_frequency = comm_df["trace"].value_counts(
        sort=True, ascending=False
    ).index
    # Translate each trace into its position in that ordering.
    rank_of_trace = {
        trace: rank for rank, trace in enumerate(traces_by_frequency)
    }
    # Work on a categorical copy of the traces, so that relabelling touches
    # only the categories instead of every single row.
    trace_categories = comm_df["trace"].astype("category")
    comm_df["contact_id"] = trace_categories.cat.rename_categories(rank_of_trace)
    return comm_df
|