diff --git a/features/communication.py b/features/communication.py index 209eabd..d92bd29 100644 --- a/features/communication.py +++ b/features/communication.py @@ -166,7 +166,7 @@ def count_comms(comm_df: pd.DataFrame, group_by=None) -> pd.DataFrame: data_type = "calls" comm_counts = ( comm_df.value_counts(subset=group_by + ["participant_id", "call_type"]) - .unstack() + .unstack(level="call_type", fill_value=0) .rename(columns=call_types) .add_prefix("no_") ) @@ -181,7 +181,7 @@ def count_comms(comm_df: pd.DataFrame, group_by=None) -> pd.DataFrame: comm_duration_total = ( comm_df.groupby(group_by + ["participant_id", "call_type"]) .sum()["call_duration"] - .unstack() + .unstack(level="call_type", fill_value=0) .rename(columns=call_types) .add_prefix("duration_total_") ) @@ -189,7 +189,7 @@ def count_comms(comm_df: pd.DataFrame, group_by=None) -> pd.DataFrame: comm_duration_max = ( comm_df.groupby(group_by + ["participant_id", "call_type"]) .max()["call_duration"] - .unstack() + .unstack(level="call_type", fill_value=0) .rename(columns=call_types) .add_prefix("duration_max_") ) @@ -208,7 +208,7 @@ def count_comms(comm_df: pd.DataFrame, group_by=None) -> pd.DataFrame: data_type = "sms" comm_counts = ( comm_df.value_counts(subset=group_by + ["participant_id", "message_type"]) - .unstack() + .unstack(level="message_type", fill_value=0) .rename(columns=sms_types) .add_prefix("no_") ) @@ -311,24 +311,34 @@ def calls_sms_features( group_by = [] count_calls = count_comms(df_calls, group_by) count_sms = count_comms(df_sms, group_by) - count_joined = count_calls.merge( - count_sms, how="outer", left_index=True, right_index=True, validate="one_to_one" - ).assign( - proportion_calls_all=( - lambda x: x.no_calls_all / (x.no_calls_all + x.no_sms_all) - ), - proportion_calls_incoming=( - lambda x: x.no_incoming / (x.no_incoming + x.no_received) - ), - proportion_calls_missed_sms_received=( - lambda x: x.no_missed / (x.no_missed + x.no_received) - ), - proportion_calls_outgoing=( - lambda x: x.no_outgoing / (x.no_outgoing + x.no_sent) - ), - proportion_calls_contacts=( - lambda x: x.no_contacts_calls / (x.no_contacts_calls + x.no_contacts_sms) + count_joined = ( + count_calls.merge( + count_sms, + how="outer", + left_index=True, + right_index=True, + validate="one_to_one", ) - # Calculate new features and create additional columns + .fillna(0, downcast="infer") + .assign( + proportion_calls_all=( + lambda x: x.no_calls_all / (x.no_calls_all + x.no_sms_all) + ), + proportion_calls_incoming=( + lambda x: x.no_incoming / (x.no_incoming + x.no_received) + ), + proportion_calls_missed_sms_received=( + lambda x: x.no_missed / (x.no_missed + x.no_received) + ), + proportion_calls_outgoing=( + lambda x: x.no_outgoing / (x.no_outgoing + x.no_sent) + ), + proportion_calls_contacts=( + lambda x: x.no_contacts_calls + / (x.no_contacts_calls + x.no_contacts_sms) + ) + # Calculate new features and create additional columns + ) + .fillna(0.5, downcast="infer") ) return count_joined