From 74b4f9ddbeffd37766422dfa38e0ebb7c468b318 Mon Sep 17 00:00:00 2001 From: Ivan Kobe Date: Tue, 10 Aug 2021 12:34:21 +0200 Subject: [PATCH] separated features --- features/communication.py | 120 +++++++++++++++++--------------------- features/screen.py | 6 +- 2 files changed, 58 insertions(+), 68 deletions(-) diff --git a/features/communication.py b/features/communication.py index 35de332..5607bec 100644 --- a/features/communication.py +++ b/features/communication.py @@ -114,8 +114,9 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame: These are: * the number of calls by type (incoming, outgoing missed) and in total, * the ratio of incoming and outgoing calls to the total number of calls, - * the total and maximum duration of calls by type, and - * the number of messages by type (received, sent). + * the total and maximum duration of calls by type, + * the number of messages by type (received, sent), and + * the number of communication contacts by type. """ if "call_type" in comm_df: comm_counts = ( @@ -148,8 +149,20 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame: .add_prefix("duration_max_") ) # Max call duration by type + comm_contacts_counts = ( + enumerate_contacts(comm_df) + .groupby(["participant_id"]) + .nunique()["contact_id"] + .reset_index() + .rename(columns={"contact_id": "no_contacts"}) + ) + # Number of communication contacts comm_features = comm_counts.join(comm_duration_total) comm_features = comm_features.join(comm_duration_max) + comm_features = comm_features.merge( + comm_contacts_counts, + on="participant_id" + ).set_index("participant_id") try: comm_features.drop(columns="duration_total_" + call_types[3], inplace=True) comm_features.drop(columns="duration_max_" + call_types[3], inplace=True) @@ -172,68 +185,66 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame: no_sent_ratio=lambda x: x.no_sent / x.no_all, ) # Ratio of incoming and outgoing messages to all messages. + comm_contacts_counts = ( + enumerate_contacts(comm_df) + .groupby(["participant_id"]) + .nunique()["contact_id"] + .reset_index() + .rename(columns={"contact_id": "no_contacts"}) + ) + # Number of communication contacts + comm_features = comm_features.merge( + comm_contacts_counts, + on="participant_id" + ).set_index("participant_id") else: raise KeyError("The dataframe contains neither call_type or message_type") return comm_features -def contact_features(df_enumerated: pd.DataFrame) -> pd.DataFrame: +def contact_features(comm_df: pd.DataFrame) -> pd.DataFrame: """ - Counts the number of people contacted (for each participant) and, if - df_enumerated is a dataframe containing calls data, the total duration - of calls between a participant and each of her contacts. + For each participant and for each of his contacts, this function + counts the number of communications (by type) between them. If the + argument passed is a dataframe with calls data, it additionally counts + the total duration of calls between every pair (participant, contact). Parameters ---------- df_enumerated: pd.DataFrame - A dataframe of calls or SMSes; return of function enumerate_contacts. + A dataframe of calls or SMSes. Returns ------- comm_df: pd.DataFrame - The altered dataframe with the column no_contacts and, if df_enumerated - contains calls data, an additional column total_call_duration. + A new dataframe with a row for each pair (participant, contact). """ - + df_enumerated = enumerate_contacts(comm_df) + contacts_count = ( + df_enumerated + .groupby(["participant_id","contact_id"]) + .size() + .reset_index() + ) # Check whether df contains calls or SMS data since some # features we want to calculate are type-specyfic if "call_duration" in df_enumerated: # Add a column with the total duration of calls between two people duration_count = ( - df_enumerated.groupby(["participant_id", "contact_id"]) + df_enumerated + .groupby(["participant_id", "contact_id"]) # For each participant and for each caller, sum durations of their calls ["call_duration"] .sum() .reset_index() # Make index (which is actually the participant id) a normal column .rename(columns={"call_duration": "total_call_duration"}) ) - # The new dataframe now contains columns containing information about - # participants, callers and the total duration of their calls. All that - # is now left to do is to merge the original df with the new one. - df_enumerated = df_enumerated.merge( - duration_count, on=["participant_id", "contact_id"] - ) - - contact_count = ( - df_enumerated.groupby(["participant_id"]) - .nunique()[ - "contact_id" - ] # For each participant, count the number of distinct contacts - .reset_index() # Make index (which is actually the participant id) a normal column - .rename(columns={"contact_id": "no_contacts"}) - ) - - df_enumerated = ( - # Merge df with the newely created df containing info about number of contacts - df_enumerated.merge(contact_count, on="participant_id") - # Sort first by participant_id and then by contact_id and - # thereby restore the inital ordering of input dataframes. - .sort_values(["participant_id", "contact_id"]) - ) - + contacts_count = contacts_count.merge(duration_count, on=["participant_id", "contact_id"]) + contacts_count.rename(columns={0:"no_calls"}, inplace=True) + else: + contacts_count.rename(columns={0:"no_sms"}, inplace=True) # TODO:Determine work vs non-work contacts by work hours heuristics - - return df_enumerated + return contacts_count def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataFrame: @@ -245,7 +256,7 @@ def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataF df_calls: pd.DataFrame A dataframe of calls (return of get_call_data). df_sms: pd.DataFrame - A dataframe of calls (return of get_sms_data). + A dataframe of SMSes (return of get_sms_data). Returns ------- @@ -263,10 +274,8 @@ def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataF * proportion_calls_contacts: proportion of calls contacts in total number of communication contacts """ - count_calls = count_comms(df_calls) count_sms = count_comms(df_sms) - count_joined = ( count_calls.merge( count_sms, on="participant_id", suffixes=("_calls", "_sms") @@ -284,6 +293,9 @@ def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataF ), proportion_calls_outgoing=( lambda x: x.no_outgoing / (x.no_outgoing + x.no_sent) + ), + proportion_calls_contacts=( + lambda x: x.no_contacts_calls / (x.no_contacts_calls + x.no_contacts_sms) ) # Calculate new features and create additional columns )[ @@ -292,33 +304,9 @@ def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataF "proportion_calls", "proportion_calls_incoming", "proportion_calls_outgoing", + "proportion_calls_contacts", "proportion_calls_missed_sms_received", ] ] # Filter out only the relevant features ) - - features_calls = contact_features(enumerate_contacts(df_calls)) - features_sms = contact_features(enumerate_contacts(df_sms)) - - features_joined = ( - features_calls.merge( - features_sms, on="participant_id", suffixes=("_calls", "_sms") - ) # Merge calls and sms features - .reset_index() # Make participant_id a regular column - .assign( - proportion_calls_contacts=( - lambda x: x.no_contacts_calls - / (x.no_contacts_calls + x.no_contacts_sms) - ) # Calculate new features and create additional columns - )[ - ["participant_id", "proportion_calls_contacts"] - ] # Filter out only the relevant features - # Since we are interested only in some features and ignored - # others, a lot of duplicate rows were created. Remove them. - .drop_duplicates() - ) - - # Join the newly created dataframes - df_calls_sms = count_joined.merge(features_joined, on="participant_id") - - return df_calls_sms + return count_joined diff --git a/features/screen.py b/features/screen.py index 593fadc..bf49cfd 100644 --- a/features/screen.py +++ b/features/screen.py @@ -38,8 +38,10 @@ def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame: # - OFF -> ON -> unlocked (a true phone unlock) # - OFF -> ON -> OFF/locked (no unlocking, i.e. a screen status check) # Consider that screen data is sometimes unreliable as shown in expl_screen.ipynb: - # "I have also seen off -> on -> unlocked (with 2 - locked missing) - # and off -> locked -> on -> off -> locked (*again*)." + # "I have also seen + # off -> on -> unlocked (with 2 - locked missing) + # and + # off -> locked -> on -> off -> locked (*again*)." # Either clean the data beforehand or deal with these inconsistencies in this function. pass