stress_at_work_analysis/features/communication.py

36 lines
1.4 KiB
Python

from typing import List
import pandas as pd
from config.models import Call, Participant
from setup import db_engine, session
def get_call_data(usernames: List) -> pd.DataFrame:
    """Load the call records of the given participants into a data frame.

    Args:
        usernames: Usernames of the participants whose calls are selected.

    Returns:
        A data frame built from a query over ``Call`` together with
        ``Participant.username``, restricted to the requested usernames.
    """
    # Pair every call with its participant and keep only the requested
    # usernames; the query is only built here, not executed.
    query_calls = (
        session.query(Call, Participant.username)
        .filter(Participant.id == Call.participant_id)
        .filter(Participant.username.in_(usernames))
    )
    # Execute the underlying SQL statement through pandas; the connection is
    # released when the ``with`` block exits.
    with db_engine.connect() as connection:
        df_calls = pd.read_sql(query_calls.statement, connection)
    return df_calls
def enumerate_contacts(comm_df: pd.DataFrame) -> pd.DataFrame:
    """Count contacts (callers, senders) and enumerate them by their frequency.

    Each distinct value of the ``trace`` column is assigned an integer code:
    0 for the most frequent trace, 1 for the next one, and so on.

    Args:
        comm_df: Communication data with a ``trace`` column identifying the
            contact. The frame is modified in place.

    Returns:
        The same ``comm_df`` object with an added (or overwritten)
        ``contact_id`` categorical column whose categories are the integer
        frequency ranks of the traces.
    """
    # Frequency table of the different traces (contacts), most frequent first.
    # NOTE: the relative order of traces with equal counts is decided by
    # pandas' ``value_counts``.
    contact_counts = comm_df["trace"].value_counts(sort=True, ascending=False)

    # Dictionary translating traces into integers, enumerated by frequency.
    contact_code = {trace: rank for rank, trace in enumerate(contact_counts.index)}

    # Use categorical data instead of a plain character column, then recode
    # the categories into integers from 0 to n_contacts - 1 so that the
    # contact with code 0 is the one contacted most often.
    comm_df["contact_id"] = comm_df["trace"].astype("category")
    comm_df["contact_id"] = comm_df["contact_id"].cat.rename_categories(contact_code)
    return comm_df