From e7fe4e8398c3440aa940ba93fbddd797ce473797 Mon Sep 17 00:00:00 2001 From: junos Date: Tue, 17 Aug 2021 13:49:53 +0200 Subject: [PATCH] Simplify merge into join. --- features/communication.py | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/features/communication.py b/features/communication.py index 5607bec..b41e068 100644 --- a/features/communication.py +++ b/features/communication.py @@ -152,17 +152,13 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame: comm_contacts_counts = ( enumerate_contacts(comm_df) .groupby(["participant_id"]) - .nunique()["contact_id"] - .reset_index() + .nunique() .rename(columns={"contact_id": "no_contacts"}) ) # Number of communication contacts comm_features = comm_counts.join(comm_duration_total) comm_features = comm_features.join(comm_duration_max) - comm_features = comm_features.merge( - comm_contacts_counts, - on="participant_id" - ).set_index("participant_id") + comm_features = comm_features.join(comm_contacts_counts) try: comm_features.drop(columns="duration_total_" + call_types[3], inplace=True) comm_features.drop(columns="duration_max_" + call_types[3], inplace=True) @@ -188,15 +184,11 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame: comm_contacts_counts = ( enumerate_contacts(comm_df) .groupby(["participant_id"]) - .nunique()["contact_id"] - .reset_index() + .nunique() .rename(columns={"contact_id": "no_contacts"}) ) # Number of communication contacts - comm_features = comm_features.merge( - comm_contacts_counts, - on="participant_id" - ).set_index("participant_id") + comm_features = comm_features.join(comm_contacts_counts) else: raise KeyError("The dataframe contains neither call_type or message_type") return comm_features @@ -221,28 +213,26 @@ def contact_features(comm_df: pd.DataFrame) -> pd.DataFrame: """ df_enumerated = enumerate_contacts(comm_df) contacts_count = ( - df_enumerated - .groupby(["participant_id","contact_id"]) - .size() - .reset_index() + df_enumerated.groupby(["participant_id", "contact_id"]).size().reset_index() ) # Check whether df contains calls or SMS data since some # features we want to calculate are type-specyfic if "call_duration" in df_enumerated: # Add a column with the total duration of calls between two people duration_count = ( - df_enumerated - .groupby(["participant_id", "contact_id"]) + df_enumerated.groupby(["participant_id", "contact_id"]) # For each participant and for each caller, sum durations of their calls ["call_duration"] .sum() .reset_index() # Make index (which is actually the participant id) a normal column .rename(columns={"call_duration": "total_call_duration"}) ) - contacts_count = contacts_count.merge(duration_count, on=["participant_id", "contact_id"]) - contacts_count.rename(columns={0:"no_calls"}, inplace=True) + contacts_count = contacts_count.merge( + duration_count, on=["participant_id", "contact_id"] + ) + contacts_count.rename(columns={0: "no_calls"}, inplace=True) else: - contacts_count.rename(columns={0:"no_sms"}, inplace=True) + contacts_count.rename(columns={0: "no_sms"}, inplace=True) # TODO:Determine work vs non-work contacts by work hours heuristics return contacts_count @@ -295,7 +285,8 @@ def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataF lambda x: x.no_outgoing / (x.no_outgoing + x.no_sent) ), proportion_calls_contacts=( - lambda x: x.no_contacts_calls / (x.no_contacts_calls + x.no_contacts_sms) + lambda x: x.no_contacts_calls + / (x.no_contacts_calls + x.no_contacts_sms) ) # Calculate new features and create additional columns )[