From 6e9c13d5d842753a7c480403954c3e7978c79ec1 Mon Sep 17 00:00:00 2001 From: junos Date: Fri, 7 May 2021 15:18:56 +0200 Subject: [PATCH] Add more call features. The total duration of calls and maximum duration by type. The number of all calls and ratio of incoming and outgoing calls. --- config/ORMtutorial.py | 42 +++++++++++++++++++----------- features/communication.py | 53 +++++++++++++++++++++++++++++++------- features/screen.py | 2 +- test/test_communication.py | 1 + 4 files changed, 73 insertions(+), 25 deletions(-) diff --git a/config/ORMtutorial.py b/config/ORMtutorial.py index 66274d9..52c08b7 100644 --- a/config/ORMtutorial.py +++ b/config/ORMtutorial.py @@ -14,20 +14,25 @@ # %% import sqlalchemy + print(sqlalchemy.__version__) # %% from sqlalchemy import create_engine -engine = create_engine('sqlite:///:memory:', echo=True) + +engine = create_engine("sqlite:///:memory:", echo=True) # %% from sqlalchemy.ext.declarative import declarative_base + Base = declarative_base() # %% from sqlalchemy import Column, Integer, String + + class User(Base): - __tablename__ = 'users' + __tablename__ = "users" id = Column(Integer, primary_key=True) name = Column(String) @@ -36,7 +41,10 @@ class User(Base): def __repr__(self): return "" % ( - self.name, self.fullname, self.nickname) + self.name, + self.fullname, + self.nickname, + ) # %% @@ -46,13 +54,14 @@ print(User.__table__) Base.metadata.create_all(engine) # %% -ed_user = User(name='ed', fullname='Ed Jones', nickname='edsnickname') +ed_user = User(name="ed", fullname="Ed Jones", nickname="edsnickname") print(ed_user.name) print(ed_user.nickname) print(str(ed_user.id)) # %% from sqlalchemy.orm import sessionmaker + Session = sessionmaker(bind=engine) session = Session() @@ -60,24 +69,27 @@ session = Session() # # Adding and Updating Objects # %% -ed_user = User(name='ed', fullname='Ed Jones', nickname='edsnickname') +ed_user = User(name="ed", fullname="Ed Jones", nickname="edsnickname") session.add(ed_user) # %% 
-our_user = session.query(User).filter_by(name='ed').first() +our_user = session.query(User).filter_by(name="ed").first() print(our_user) # %% print(ed_user is our_user) # %% -session.add_all([ - User(name='wendy', fullname='Wendy Williams', nickname='windy'), - User(name='mary', fullname='Mary Contrary', nickname='mary'), - User(name='fred', fullname='Fred Flintstone', nickname='freddy')]) +session.add_all( + [ + User(name="wendy", fullname="Wendy Williams", nickname="windy"), + User(name="mary", fullname="Mary Contrary", nickname="mary"), + User(name="fred", fullname="Fred Flintstone", nickname="freddy"), + ] +) # %% -ed_user.nickname = 'eddie' +ed_user.nickname = "eddie" # %% print(session.dirty) @@ -95,14 +107,14 @@ print(ed_user.id) # # Rolling back # %% -ed_user.name = 'Edwardo' +ed_user.name = "Edwardo" # %% -fake_user = User(name='fakeuser', fullname='Invalid', nickname='12345') +fake_user = User(name="fakeuser", fullname="Invalid", nickname="12345") session.add(fake_user) # %% -session.query(User).filter(User.name.in_(['Edwardo', 'fakeuser'])).all() +session.query(User).filter(User.name.in_(["Edwardo", "fakeuser"])).all() # %% session.rollback() @@ -110,4 +122,4 @@ print(ed_user.name) print(fake_user in session) # %% -session.query(User).filter(User.name.in_(['ed', 'fakeuser'])).all() +session.query(User).filter(User.name.in_(["ed", "fakeuser"])).all() diff --git a/features/communication.py b/features/communication.py index 6a3d8e3..a2acfdb 100644 --- a/features/communication.py +++ b/features/communication.py @@ -15,7 +15,7 @@ def get_call_data(usernames: Collection) -> pd.DataFrame: Parameters ---------- - usernames: List + usernames: Collection A list of usernames to put into the WHERE condition. Returns @@ -39,7 +39,7 @@ def get_sms_data(usernames: Collection) -> pd.DataFrame: Parameters ---------- - usernames: List + usernames: Collection A list of usernames to put into the WHERE condition. 
Returns @@ -111,9 +111,10 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame: comm_features: pd.DataFrame A list of communication features for every participant. These are: - * the number of messages by type (received, sent), - * the number of calls by type (incoming, outgoing missed), and - * the duration of calls by type. + * the number of calls by type (incoming, outgoing missed) and in total, + * the ratio of incoming and outgoing calls to the total number of calls, + * the total and maximum duration of calls by type, and + * the number of messages by type (received, sent). """ if "call_type" in comm_df: comm_counts = ( @@ -122,16 +123,35 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame: .rename(columns=call_types) .add_prefix("no_") ) - comm_duration = ( + # Count calls by type. + comm_counts["no_all"] = comm_counts.sum(axis=1) + # Add a total count of calls. + comm_counts = comm_counts.assign( + no_incoming_ratio=lambda x: x.no_incoming / x.no_all, + no_outgoing_ratio=lambda x: x.no_outgoing / x.no_all, + ) + # Ratio of incoming and outgoing calls to all calls. + comm_duration_total = ( comm_df.groupby(["participant_id", "call_type"]) .sum()["call_duration"] .unstack() .rename(columns=call_types) - .add_prefix("duration_") + .add_prefix("duration_total_") ) - comm_features = comm_counts.join(comm_duration) + # Total call duration by type. + comm_duration_max = ( + comm_df.groupby(["participant_id", "call_type"]) + .max()["call_duration"] + .unstack() + .rename(columns=call_types) + .add_prefix("duration_max_") + ) + # Max call duration by type + comm_features = comm_counts.join(comm_duration_total) + comm_features = comm_features.join(comm_duration_max) try: - comm_features.drop(columns="duration_" + call_types[3], inplace=True) + comm_features.drop(columns="duration_total_" + call_types[3], inplace=True) + comm_features.drop(columns="duration_max_" + call_types[3], inplace=True) # The missed calls are always of 0 duration. 
except KeyError: pass @@ -145,6 +165,21 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame: .add_prefix("no_") ) comm_features = comm_counts + # TODO Add ratio of outgoing and incoming texts. else: raise KeyError("The dataframe contains neither call_type or message_type") return comm_features + + +def contact_features(): + # TODO Implement a method that takes a DF with enumerated contacts as argument and calculates: + # * Duration of calls per caller (for most common callers) + # * Determine work vs non-work contacts by work hours heuristics + # * Number of people contacted + # And similarly for SMS. + pass + + +def calls_sms_features(): + # TODO Relate the calls and sms data, such as comparing the number of (missed) calls and messages. + pass diff --git a/features/screen.py b/features/screen.py index d639b45..92fcba0 100644 --- a/features/screen.py +++ b/features/screen.py @@ -14,7 +14,7 @@ def get_screen_data(usernames: Collection) -> pd.DataFrame: Parameters ---------- - usernames: List + usernames: Collection A list of usernames to put into the WHERE condition. Returns diff --git a/test/test_communication.py b/test/test_communication.py index c973f3b..6d6d751 100644 --- a/test/test_communication.py +++ b/test/test_communication.py @@ -71,4 +71,5 @@ class CallsFeatures(unittest.TestCase): def test_count_comms(self): self.features = count_comms(self.calls) + print(self.features) self.assertIsInstance(self.features, pd.DataFrame)