stress_at_work_analysis/features/communication.py

from typing import List

import pandas as pd

from config.models import Call, Participant
from setup import db_engine, session


def get_call_data(usernames: List) -> pd.DataFrame:
    """Load the call records of the given participants into a data frame.

    Parameters
    ----------
    usernames:
        Usernames of the participants whose calls should be selected.

    Returns
    -------
    pd.DataFrame
        The result of querying ``Call`` together with ``Participant.username``,
        restricted to rows whose participant username is in ``usernames``.
    """
    # Join condition: each call row is matched to its participant.
    belongs_to_participant = Participant.id == Call.participant_id
    # Selection condition: keep only the requested participants.
    is_requested_user = Participant.username.in_(usernames)

    calls_statement = (
        session.query(Call, Participant.username)
        .filter(belongs_to_participant)
        .filter(is_requested_user)
        .statement
    )
    # The connection is only needed while pandas executes the statement.
    with db_engine.connect() as conn:
        return pd.read_sql(calls_statement, conn)


def enumerate_contacts(comm_df: pd.DataFrame) -> pd.DataFrame:
    """Count contacts (callers, senders) and enumerate them by their frequency.

    Every distinct value of the ``trace`` column is assigned an integer rank:
    0 for the most frequent trace, 1 for the next one, and so on. The rank is
    stored in a new (or overwritten) ``contact_id`` column.

    Parameters
    ----------
    comm_df:
        Communication data containing a ``trace`` column identifying the contact.

    Returns
    -------
    pd.DataFrame
        The same ``comm_df`` object, modified in place, with ``contact_id``
        added as a categorical column whose categories are the integer ranks.
        Rows with a missing ``trace`` get a missing ``contact_id``.
    """
    # Frequency table of the different traces (contacts), most frequent first.
    trace_counts = comm_df["trace"].value_counts(sort=True, ascending=False)
    # Dictionary translating each trace into its rank in the frequency table.
    contact_code = {trace: rank for rank, trace in enumerate(trace_counts.index)}
    # Convert the traces to categorical data and recode the categories into
    # integers from 0 to n_contacts - 1, so that 0 is the contact seen most often.
    comm_df["contact_id"] = (
        comm_df["trace"].astype("category").cat.rename_categories(contact_code)
    )
    return comm_df