diff --git a/features/communication.py b/features/communication.py index 0c44259..35de332 100644 --- a/features/communication.py +++ b/features/communication.py @@ -151,10 +151,8 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame: comm_features = comm_counts.join(comm_duration_total) comm_features = comm_features.join(comm_duration_max) try: - comm_features.drop(columns="duration_total_" + - call_types[3], inplace=True) - comm_features.drop(columns="duration_max_" + - call_types[3], inplace=True) + comm_features.drop(columns="duration_total_" + call_types[3], inplace=True) + comm_features.drop(columns="duration_max_" + call_types[3], inplace=True) # The missed calls are always of 0 duration. except KeyError: pass @@ -175,8 +173,7 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame: ) # Ratio of incoming and outgoing messages to all messages. else: - raise KeyError( - "The dataframe contains neither call_type or message_type") + raise KeyError("The dataframe contains neither call_type or message_type") return comm_features @@ -203,11 +200,10 @@ def contact_features(df_enumerated: pd.DataFrame) -> pd.DataFrame: if "call_duration" in df_enumerated: # Add a column with the total duration of calls between two people duration_count = ( - df_enumerated.groupby( - ["participant_id", "contact_id"] - ) + df_enumerated.groupby(["participant_id", "contact_id"]) # For each participant and for each caller, sum durations of their calls - ["call_duration"].sum() + ["call_duration"] + .sum() .reset_index() # Make index (which is actually the participant id) a normal column .rename(columns={"call_duration": "total_call_duration"}) ) @@ -215,17 +211,18 @@ def contact_features(df_enumerated: pd.DataFrame) -> pd.DataFrame: # participants, callers and the total duration of their calls. All that # is now left to do is to merge the original df with the new one. df_enumerated = df_enumerated.merge( - duration_count, - on=["participant_id", "contact_id"] - ) + duration_count, on=["participant_id", "contact_id"] + ) contact_count = ( df_enumerated.groupby(["participant_id"]) - .nunique()["contact_id"] # For each participant, count the number of distinct contacts + .nunique()[ + "contact_id" + ] # For each participant, count the number of distinct contacts .reset_index() # Make index (which is actually the participant id) a normal column .rename(columns={"contact_id": "no_contacts"}) ) - + df_enumerated = ( # Merge df with the newely created df containing info about number of contacts df_enumerated.merge(contact_count, on="participant_id") @@ -258,7 +255,7 @@ def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataF * proportion_calls: proportion of calls in total number of communications * proportion_calls_incoming: - proportion of incoming calls in total number of incoming/recieved communications + proportion of incoming calls in total number of incoming/received communications * proportion_calls_outgoing: proportion of outgoing calls in total number of outgoing/sent communications * proportion_calls_missed_sms_received: @@ -290,12 +287,14 @@ def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataF ) # Calculate new features and create additional columns )[ - ["participant_id", - "proportion_calls", - "proportion_calls_incoming", - "proportion_calls_outgoing", - "proportion_calls_missed_sms_received"] - ] # Filter out only the relevant feautres + [ + "participant_id", + "proportion_calls", + "proportion_calls_incoming", + "proportion_calls_outgoing", + "proportion_calls_missed_sms_received", + ] + ] # Filter out only the relevant features ) features_calls = contact_features(enumerate_contacts(df_calls)) @@ -305,22 +304,21 @@ def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataF features_calls.merge( features_sms, on="participant_id", suffixes=("_calls", "_sms") ) # Merge calls and sms features - .reset_index() # Make participand_id a regular column + .reset_index() # Make participant_id a regular column .assign( proportion_calls_contacts=( - lambda x: x.no_contacts_calls / - (x.no_contacts_calls + x.no_contacts_sms) + lambda x: x.no_contacts_calls + / (x.no_contacts_calls + x.no_contacts_sms) ) # Calculate new features and create additional columns )[ - ["participant_id", - "proportion_calls_contacts"] - ] # Filter out only the relevant feautres + ["participant_id", "proportion_calls_contacts"] + ] # Filter out only the relevant features # Since we are interested only in some features and ignored # others, a lot of duplicate rows were created. Remove them. .drop_duplicates() ) - # Join the newely created dataframes + # Join the newly created dataframes df_calls_sms = count_joined.merge(features_joined, on="participant_id") return df_calls_sms