diff --git a/config/models.py b/config/models.py index 0735a92..f5edefe 100644 --- a/config/models.py +++ b/config/models.py @@ -47,6 +47,7 @@ class Participant(Base): tester: bool Is this a tester (or a true participant)? """ + __tablename__ = "participants" id = Column(Integer, primary_key=True) username = Column(String(64), index=True, unique=True) @@ -96,6 +97,7 @@ class AWAREsensor(object): participant_id: int The foreign key relating (with the relationship) tables to the participants table. """ + id = Column(BigInteger, primary_key=True, nullable=False) _id = Column(BigInteger, nullable=False) timestamp = Column(BigInteger, nullable=False) @@ -205,6 +207,7 @@ class Call(Base, AWAREsensor): trace: str(40) A hash value SHA-1 of the phone number (source or target) of the call """ + call_type = Column(SmallInteger, nullable=False) call_duration = Column(Integer, nullable=False) trace = Column(String(length=40), nullable=True) @@ -345,6 +348,7 @@ class SMS(Base, AWAREsensor): trace: str(40) A hash value SHA-1 of the phone number (source or target) of the call """ + message_type = Column(SmallInteger, nullable=False) trace = Column(String(length=40), nullable=False) diff --git a/features/communication.py b/features/communication.py index 97d985d..d5aebcf 100644 --- a/features/communication.py +++ b/features/communication.py @@ -5,6 +5,9 @@ import pandas as pd from config.models import Call, Participant from setup import db_engine, session +call_types = {1: "incoming", 2: "outgoing", 3: "missed"} +sms_types = {1: "received", 2: "sent"} + def get_call_data(usernames: List) -> pd.DataFrame: """ @@ -58,3 +61,33 @@ def enumerate_contacts(comm_df: pd.DataFrame) -> pd.DataFrame: comm_df["contact_id"] = comm_df["contact_id"].cat.rename_categories(contact_code) # Recode the contacts into integers from 0 to n_contacts, so that the first one is contacted the most often. 
def _count_by_type(
    comm_df: pd.DataFrame, type_column: str, type_names: dict
) -> pd.DataFrame:
    """Count rows per participant and communication type, one column per type."""
    return (
        comm_df.value_counts(subset=["participant_id", type_column])
        .unstack()
        .rename(columns=type_names)
        .add_prefix("no_")
    )


def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate per-participant communication features, split by type.

    For call data (a "call_type" column is present), the number of calls and
    the summed "call_duration" are computed for every call type. For SMS data
    (a "message_type" column is present), only the number of messages per
    message type is computed. Type codes are translated to names using the
    module-level call_types and sms_types dictionaries.

    Parameters
    ----------
    comm_df: pd.DataFrame
        A dataframe with a "participant_id" column and either a "call_type"
        (together with "call_duration") or a "message_type" column.
        If both type columns are present, the data is treated as calls.

    Returns
    -------
    comm_features: pd.DataFrame
        A dataframe indexed by participant_id, with one "no_<type>" column per
        observed type and, for calls, one "duration_<type>" column per
        observed type. Combinations of participant and type that do not occur
        in the data are NaN.

    Raises
    ------
    KeyError
        If the dataframe contains neither "call_type" nor "message_type".
    """
    if "call_type" in comm_df:
        call_counts = _count_by_type(comm_df, "call_type", call_types)
        # Select the duration column *before* aggregating: summing the whole
        # frame is wasteful and fails on columns that cannot be summed
        # (e.g. datetimes) in pandas versions where numeric_only is False.
        call_durations = (
            comm_df.groupby(["participant_id", "call_type"])["call_duration"]
            .sum()
            .unstack()
            .rename(columns=call_types)
            .add_prefix("duration_")
        )
        return call_counts.join(call_durations)

    if "message_type" in comm_df:
        return _count_by_type(comm_df, "message_type", sms_types)

    raise KeyError("The dataframe contains neither call_type or message_type")