diff --git a/features/communication.py b/features/communication.py index 35de332..2fb6bc9 100644 --- a/features/communication.py +++ b/features/communication.py @@ -8,6 +8,43 @@ from setup import db_engine, session call_types = {1: "incoming", 2: "outgoing", 3: "missed"} sms_types = {1: "received", 2: "sent"} +FEATURES_CALLS = ( + ["no_calls_all"] + + ["no_" + call_type for call_type in call_types.values()] + + ["duration_total_" + call_types.get(1), "duration_total_" + call_types.get(2)] + + ["duration_max_" + call_types.get(1), "duration_max_" + call_types.get(2)] + + ["no_" + call_types.get(1) + "_ratio", "no_" + call_types.get(2) + "_ratio"] + + ["no_contacts_calls"] +) + +# FEATURES_CALLS = +# ["no_calls_all", +# "no_incoming", "no_outgoing", "no_missed", +# "duration_total_incoming", "duration_total_outgoing", +# "duration_max_incoming", "duration_max_outgoing", +# "no_incoming_ratio", "no_outgoing_ratio", +# "no_contacts_calls"] + +FEATURES_SMS = ( + ["no_sms_all"] + + ["no_" + sms_type for sms_type in sms_types.values()] + + ["no_" + sms_types.get(1) + "_ratio", "no_" + sms_types.get(2) + "_ratio"] + + ["no_contacts_sms"] +) +# FEATURES_SMS = +# ["no_sms_all", +# "no_received", "no_sent", +# "no_received_ratio", "no_sent_ratio", +# "no_contacts_sms"] + +FEATURES_CONTACT = [ + "proportion_calls_all", + "proportion_calls_incoming", + "proportion_calls_outgoing", + "proportion_calls_contacts", + "proportion_calls_missed_sms_received", +] + def get_call_data(usernames: Collection) -> pd.DataFrame: """ @@ -114,10 +151,12 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame: These are: * the number of calls by type (incoming, outgoing missed) and in total, * the ratio of incoming and outgoing calls to the total number of calls, - * the total and maximum duration of calls by type, and - * the number of messages by type (received, sent). 
+ * the total and maximum duration of calls by type, + * the number of messages by type (received, sent), and + * the number of distinct contacts per participant (for calls or SMS, depending on the input). """ if "call_type" in comm_df: + data_type = "calls" comm_counts = ( comm_df.value_counts(subset=["participant_id", "call_type"]) .unstack() @@ -125,11 +164,11 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame: .add_prefix("no_") ) # Count calls by type. - comm_counts["no_all"] = comm_counts.sum(axis=1) + comm_counts["no_calls_all"] = comm_counts.sum(axis=1) # Add a total count of calls. comm_counts = comm_counts.assign( - no_incoming_ratio=lambda x: x.no_incoming / x.no_all, - no_outgoing_ratio=lambda x: x.no_outgoing / x.no_all, + no_incoming_ratio=lambda x: x.no_incoming / x.no_calls_all, + no_outgoing_ratio=lambda x: x.no_outgoing / x.no_calls_all, ) # Ratio of incoming and outgoing calls to all calls. comm_duration_total = ( @@ -159,44 +198,56 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame: # If there were no missed calls, this exception is raised. # But we are dropping the column anyway, so no need to deal with the exception. elif "message_type" in comm_df: + data_type = "sms" comm_counts = ( comm_df.value_counts(subset=["participant_id", "message_type"]) .unstack() .rename(columns=sms_types) .add_prefix("no_") ) - comm_counts["no_all"] = comm_counts.sum(axis=1) + comm_counts["no_sms_all"] = comm_counts.sum(axis=1) # Add a total count of messages. comm_features = comm_counts.assign( - no_received_ratio=lambda x: x.no_received / x.no_all, - no_sent_ratio=lambda x: x.no_sent / x.no_all, + no_received_ratio=lambda x: x.no_received / x.no_sms_all, + no_sent_ratio=lambda x: x.no_sent / x.no_sms_all, ) # Ratio of incoming and outgoing messages to all messages. 
else: raise KeyError("The dataframe contains neither call_type or message_type") + comm_contacts_counts = ( + enumerate_contacts(comm_df) + .groupby(["participant_id"]) + .nunique()["contact_id"] + .rename("no_contacts_" + data_type) + ) + # Number of communication contacts + comm_features = comm_features.join(comm_contacts_counts) return comm_features -def contact_features(df_enumerated: pd.DataFrame) -> pd.DataFrame: +def contact_features(comm_df: pd.DataFrame) -> pd.DataFrame: """ - Counts the number of people contacted (for each participant) and, if - df_enumerated is a dataframe containing calls data, the total duration - of calls between a participant and each of her contacts. + For each participant and for each of his contacts, this function + counts the number of communications (by type) between them. If the + argument passed is a dataframe with calls data, it additionally counts + the total duration of calls between every pair (participant, contact). Parameters ---------- - df_enumerated: pd.DataFrame - A dataframe of calls or SMSes; return of function enumerate_contacts. + comm_df: pd.DataFrame + A dataframe of calls or SMSes. Returns ------- comm_df: pd.DataFrame - The altered dataframe with the column no_contacts and, if df_enumerated - contains calls data, an additional column total_call_duration. + A new dataframe with a row for each pair (participant, contact). 
""" - + df_enumerated = enumerate_contacts(comm_df) + contacts_count = ( + df_enumerated.groupby(["participant_id", "contact_id"]).size().reset_index() + ) # Check whether df contains calls or SMS data since some - # features we want to calculate are type-specyfic + # features we want to calculate are type-specific if "call_duration" in df_enumerated: # Add a column with the total duration of calls between two people duration_count = ( @@ -207,33 +258,14 @@ def contact_features(df_enumerated: pd.DataFrame) -> pd.DataFrame: .reset_index() # Make index (which is actually the participant id) a normal column .rename(columns={"call_duration": "total_call_duration"}) ) - # The new dataframe now contains columns containing information about - # participants, callers and the total duration of their calls. All that - # is now left to do is to merge the original df with the new one. - df_enumerated = df_enumerated.merge( + contacts_count = contacts_count.merge( duration_count, on=["participant_id", "contact_id"] ) - - contact_count = ( - df_enumerated.groupby(["participant_id"]) - .nunique()[ - "contact_id" - ] # For each participant, count the number of distinct contacts - .reset_index() # Make index (which is actually the participant id) a normal column - .rename(columns={"contact_id": "no_contacts"}) - ) - - df_enumerated = ( - # Merge df with the newely created df containing info about number of contacts - df_enumerated.merge(contact_count, on="participant_id") - # Sort first by participant_id and then by contact_id and - # thereby restore the inital ordering of input dataframes. 
- .sort_values(["participant_id", "contact_id"]) - ) - + contacts_count.rename(columns={0: "no_calls"}, inplace=True) + else: + contacts_count.rename(columns={0: "no_sms"}, inplace=True) # TODO:Determine work vs non-work contacts by work hours heuristics - - return df_enumerated + return contacts_count def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataFrame: @@ -245,14 +277,14 @@ def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataF df_calls: pd.DataFrame A dataframe of calls (return of get_call_data). df_sms: pd.DataFrame - A dataframe of calls (return of get_sms_data). + A dataframe of SMSes (return of get_sms_data). Returns ------- df_calls_sms: pd.DataFrame The list of features relating calls and sms data for every participant. These are: - * proportion_calls: + * proportion_calls_all: proportion of calls in total number of communications * proportion_calls_incoming: proportion of incoming calls in total number of incoming/received communications @@ -263,62 +295,24 @@ def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataF * proportion_calls_contacts: proportion of calls contacts in total number of communication contacts """ - count_calls = count_comms(df_calls) count_sms = count_comms(df_sms) - - count_joined = ( - count_calls.merge( - count_sms, on="participant_id", suffixes=("_calls", "_sms") - ) # Merge calls and sms features - .reset_index() # Make participant_id a regular column - .assign( - proportion_calls=( - lambda x: x.no_all_calls / (x.no_all_calls + x.no_all_sms) - ), - proportion_calls_incoming=( - lambda x: x.no_incoming / (x.no_incoming + x.no_received) - ), - proportion_calls_missed_sms_received=( - lambda x: x.no_missed / (x.no_missed + x.no_received) - ), - proportion_calls_outgoing=( - lambda x: x.no_outgoing / (x.no_outgoing + x.no_sent) - ) - # Calculate new features and create additional columns - )[ - [ - "participant_id", - "proportion_calls", - 
"proportion_calls_incoming", - "proportion_calls_outgoing", - "proportion_calls_missed_sms_received", - ] - ] # Filter out only the relevant features + count_joined = count_calls.join(count_sms).assign( + proportion_calls_all=( + lambda x: x.no_calls_all / (x.no_calls_all + x.no_sms_all) + ), + proportion_calls_incoming=( + lambda x: x.no_incoming / (x.no_incoming + x.no_received) + ), + proportion_calls_missed_sms_received=( + lambda x: x.no_missed / (x.no_missed + x.no_received) + ), + proportion_calls_outgoing=( + lambda x: x.no_outgoing / (x.no_outgoing + x.no_sent) + ), + proportion_calls_contacts=( + lambda x: x.no_contacts_calls / (x.no_contacts_calls + x.no_contacts_sms) + ) + # Calculate new features and create additional columns ) - - features_calls = contact_features(enumerate_contacts(df_calls)) - features_sms = contact_features(enumerate_contacts(df_sms)) - - features_joined = ( - features_calls.merge( - features_sms, on="participant_id", suffixes=("_calls", "_sms") - ) # Merge calls and sms features - .reset_index() # Make participant_id a regular column - .assign( - proportion_calls_contacts=( - lambda x: x.no_contacts_calls - / (x.no_contacts_calls + x.no_contacts_sms) - ) # Calculate new features and create additional columns - )[ - ["participant_id", "proportion_calls_contacts"] - ] # Filter out only the relevant features - # Since we are interested only in some features and ignored - # others, a lot of duplicate rows were created. Remove them. - .drop_duplicates() - ) - - # Join the newly created dataframes - df_calls_sms = count_joined.merge(features_joined, on="participant_id") - - return df_calls_sms + return count_joined diff --git a/features/screen.py b/features/screen.py index 593fadc..bf49cfd 100644 --- a/features/screen.py +++ b/features/screen.py @@ -38,8 +38,10 @@ def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame: # - OFF -> ON -> unlocked (a true phone unlock) # - OFF -> ON -> OFF/locked (no unlocking, i.e. 
a screen status check) # Consider that screen data is sometimes unreliable as shown in expl_screen.ipynb: - # "I have also seen off -> on -> unlocked (with 2 - locked missing) - # and off -> locked -> on -> off -> locked (*again*)." + # "I have also seen + # off -> on -> unlocked (with 2 - locked missing) + # and + # off -> locked -> on -> off -> locked (*again*)." # Either clean the data beforehand or deal with these inconsistencies in this function. pass diff --git a/test/test_communication.py b/test/test_communication.py index a767255..5258a66 100644 --- a/test/test_communication.py +++ b/test/test_communication.py @@ -5,7 +5,7 @@ import pandas as pd from numpy.random import default_rng from pandas.testing import assert_series_equal -from features.communication import count_comms, enumerate_contacts, get_call_data +from features.communication import * rng = default_rng() @@ -76,10 +76,18 @@ class CallsFeatures(unittest.TestCase): def test_count_comms_calls(self): self.features = count_comms(self.calls) - print(self.features) self.assertIsInstance(self.features, pd.DataFrame) + self.assertCountEqual(self.features.columns.to_list(), FEATURES_CALLS) def test_count_comms_sms(self): self.features = count_comms(self.sms) - print(self.features) self.assertIsInstance(self.features, pd.DataFrame) + self.assertCountEqual(self.features.columns.to_list(), FEATURES_SMS) + + def test_calls_sms_features(self): + self.features_call_sms = calls_sms_features(self.calls, self.sms) + self.assertIsInstance(self.features_call_sms, pd.DataFrame) + self.assertCountEqual( + self.features_call_sms.columns.to_list(), + FEATURES_CALLS + FEATURES_SMS + FEATURES_CONTACT, + )