diff --git a/exploration/expl_communication.py b/exploration/expl_communication.py index 0c464c0..a9787e6 100644 --- a/exploration/expl_communication.py +++ b/exploration/expl_communication.py @@ -13,14 +13,15 @@ # name: straw2analysis # --- +# %% +import importlib + # %% # %matplotlib inline import os import sys import matplotlib.pyplot as plt - -# %% import seaborn as sns nb_dir = os.path.split(os.getcwd())[0] @@ -28,21 +29,29 @@ if nb_dir not in sys.path: sys.path.append(nb_dir) # %% -from features.communication import * +from features import communication, helper + +# %% +importlib.reload(communication) # %% [markdown] # # Example of communication data and feature calculation # %% -df_calls = get_call_data(["nokia_0000003"]) +df_calls = communication.get_call_data(["nokia_0000003"]) print(df_calls) # %% -count_comms(df_calls) +df_calls = helper.get_date_from_timestamp(df_calls) +communication.count_comms(df_calls, ["date_lj"]) # %% -df_sms = get_sms_data(["nokia_0000003"]) -count_comms(df_sms) +df_sms = communication.get_sms_data(["nokia_0000003"]) +df_sms = helper.get_date_from_timestamp(df_sms) +communication.count_comms(df_sms, ["date_lj"]) + +# %% +communication.calls_sms_features(df_calls, df_sms, ["date_lj"]) # %% [markdown] # # Call data diff --git a/features/communication.py b/features/communication.py index a858830..209eabd 100644 --- a/features/communication.py +++ b/features/communication.py @@ -137,7 +137,7 @@ def enumerate_contacts(comm_df: pd.DataFrame) -> pd.DataFrame: return comm_df -def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame: +def count_comms(comm_df: pd.DataFrame, group_by=None) -> pd.DataFrame: """ Calculate frequencies (and duration) of messages (or calls), grouped by their types. @@ -145,6 +145,9 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame: ---------- comm_df: pd.DataFrame A dataframe of calls or SMSes. + group_by: list + A list of strings, specifying by which parameters to group. 
+ Features are always calculated per participant; columns listed here (e.g. "date_lj") are used as additional grouping keys. Returns ------- @@ -157,10 +160,12 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame: * the number of messages by type (received, sent), and * the number of communication contacts by type. """ + if group_by is None: + group_by = [] if "call_type" in comm_df: data_type = "calls" comm_counts = ( - comm_df.value_counts(subset=["participant_id", "call_type"]) + comm_df.value_counts(subset=group_by + ["participant_id", "call_type"]) .unstack() .rename(columns=call_types) .add_prefix("no_") @@ -174,7 +179,7 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame: ) # Ratio of incoming and outgoing calls to all calls. comm_duration_total = ( - comm_df.groupby(["participant_id", "call_type"]) + comm_df.groupby(group_by + ["participant_id", "call_type"]) .sum()["call_duration"] .unstack() .rename(columns=call_types) @@ -182,7 +187,7 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame: ) # Total call duration by type. 
comm_duration_max = ( - comm_df.groupby(["participant_id", "call_type"]) + comm_df.groupby(group_by + ["participant_id", "call_type"]) .max()["call_duration"] .unstack() .rename(columns=call_types) @@ -202,7 +207,7 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame: elif "message_type" in comm_df: data_type = "sms" comm_counts = ( - comm_df.value_counts(subset=["participant_id", "message_type"]) + comm_df.value_counts(subset=group_by + ["participant_id", "message_type"]) .unstack() .rename(columns=sms_types) .add_prefix("no_") @@ -218,7 +223,7 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame: raise KeyError("The dataframe contains neither call_type or message_type") comm_contacts_counts = ( enumerate_contacts(comm_df) - .groupby(["participant_id"]) + .groupby(group_by + ["participant_id"]) .nunique()["contact_id"] .rename("no_contacts_" + data_type) ) @@ -270,7 +275,9 @@ def contact_features(comm_df: pd.DataFrame) -> pd.DataFrame: return contacts_count -def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataFrame: +def calls_sms_features( + df_calls: pd.DataFrame, df_sms: pd.DataFrame, group_by=None +) -> pd.DataFrame: """ Calculates additional features relating calls and sms data. @@ -280,6 +287,9 @@ def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataF A dataframe of calls (return of get_call_data). df_sms: pd.DataFrame A dataframe of SMSes (return of get_sms_data). + group_by: list + A list of strings, specifying by which parameters to group. + Features are always calculated per participant; columns listed here (e.g. "date_lj") are used as additional grouping keys. 
Returns ------- @@ -297,9 +307,13 @@ def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataF * proportion_calls_contacts: proportion of calls contacts in total number of communication contacts """ - count_calls = count_comms(df_calls) - count_sms = count_comms(df_sms) - count_joined = count_calls.join(count_sms).assign( + if group_by is None: + group_by = [] + count_calls = count_comms(df_calls, group_by) + count_sms = count_comms(df_sms, group_by) + count_joined = count_calls.merge( + count_sms, how="outer", left_index=True, right_index=True, validate="one_to_one" + ).assign( proportion_calls_all=( lambda x: x.no_calls_all / (x.no_calls_all + x.no_sms_all) ),