# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.11.4
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %%
import importlib

# %%
# %matplotlib inline
import os
import sys

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Make the parent of the current working directory importable so that the
# project packages used below (`features`, `participants`) can be found.
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

# %%
from features import communication, helper

# %%
# Pick up edits made to the module without restarting the kernel.
importlib.reload(communication)

# %% [markdown]
# # Example of communication data and feature calculation

# %%
df_calls = communication.get_call_data(["nokia_0000003"])
print(df_calls)

# %%
df_calls = helper.get_date_from_timestamp(df_calls)
communication.count_comms(df_calls, ["date_lj"])

# %%
df_sms = communication.get_sms_data(["nokia_0000003"])
df_sms = helper.get_date_from_timestamp(df_sms)
communication.count_comms(df_sms, ["date_lj"])

# %%
communication.calls_sms_features(df_calls, df_sms, ["date_lj"])

# %% [markdown]
# # Call data

# %%
import participants.query_db

# %%
# NOTE(review): the variable names say "inactive", but get_usernames() is
# called without any filter here - confirm which participants it returns.
participants_inactive_usernames = participants.query_db.get_usernames()
df_calls_inactive = communication.get_call_data(participants_inactive_usernames)

# %%
participants_inactive_usernames

# %%
df_calls_inactive.head()

# %%
communication.enumerate_contacts(df_calls_inactive).head()

# %%
# NOTE(review): count_comms is called without a grouping argument here (the
# example cells above pass ["date_lj"]); this presumably falls back to a
# per-participant default - confirm against features.communication.
df_calls_features = communication.count_comms(df_calls_inactive)
df_calls_features.head()

# %%
df_calls_features.describe()

# %%
# Reshape the no_<type> columns into long format for plotting. The suffix
# must be given explicitly because the call types are words, not digits;
# it is a raw string so that the regex backslash is not treated as a string
# escape.
calls_number = pd.wide_to_long(
    df_calls_features[["no_incoming", "no_outgoing", "no_missed"]].reset_index(),
    i="participant_id",
    j="call_type",
    stubnames="no",
    sep="_",
    suffix=r"\D+",
)

# %%
calls_number

# %%
sns.displot(calls_number, x="no", hue="call_type", binwidth=5, element="step", height=8)

# %%
# Same reshape for the duration_<...> columns; everything after the
# "duration_" prefix becomes the call_type value.
calls_duration = pd.wide_to_long(
    df_calls_features[
        ["duration_total_incoming", "duration_total_outgoing"]
    ].reset_index(),
    i="participant_id",
    j="call_type",
    stubnames="duration",
    sep="_",
    suffix=r"\D+",
)
sns.displot(
    calls_duration,
    x="duration",
    hue="call_type",
    multiple="dodge",
    height=8,
    log_scale=(True, False),
)

# %% [markdown]
# ## Most frequent contacts by participant

# %%
df_calls_inactive = communication.enumerate_contacts(df_calls_inactive)
df_calls_inactive.tail()

# %%
# Keep only rows whose contact_id is below 5.
df_calls_frequent = df_calls_inactive.query("contact_id < 5")

# %%
sns.boxplot(x="contact_id", y="freq", data=df_calls_frequent)

# %% [markdown]
# # SMS data

# %%
df_sms_inactive = communication.get_sms_data(participants_inactive_usernames)
df_sms_features = communication.count_comms(df_sms_inactive)
df_sms_features.describe()

# %%
sms_number = pd.wide_to_long(
    df_sms_features[["no_received", "no_sent"]].reset_index(),
    i="participant_id",
    j="message_type",
    stubnames="no",
    sep="_",
    suffix=r"\D+",
)
sns.displot(
    sms_number, x="no", hue="message_type", binwidth=5, element="step", height=8
)

# %% [markdown]
# # Communication features

# %%
# `display` is the rich-output helper that the notebook kernel provides as a
# builtin; it is intentionally not imported here.
df_calls_enumerated = communication.enumerate_contacts(df_calls)
display(df_calls_enumerated)

# %%
df_calls_contact_features = communication.contact_features(df_calls_enumerated)
display(df_calls_contact_features)

# %%
df_sms_enumerated = communication.enumerate_contacts(df_sms)
df_sms_contact_features = communication.contact_features(df_sms_enumerated)
display(df_sms_contact_features)

# %%
display(communication.count_comms(df_calls))

# %%
display(communication.count_comms(df_sms))

# %%
display(communication.calls_sms_features(df_calls, df_sms))

# %%