separated features

communication
Ivan Kobe 2021-08-10 12:34:21 +02:00
parent 06e1fe7410
commit 74b4f9ddbe
2 changed files with 58 additions and 68 deletions

View File

@ -114,8 +114,9 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame:
These are:
* the number of calls by type (incoming, outgoing, missed) and in total,
* the ratio of incoming and outgoing calls to the total number of calls,
* the total and maximum duration of calls by type, and
* the number of messages by type (received, sent).
* the total and maximum duration of calls by type,
* the number of messages by type (received, sent), and
* the number of communication contacts by type.
"""
if "call_type" in comm_df:
comm_counts = (
@ -148,8 +149,20 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame:
.add_prefix("duration_max_")
)
# Max call duration by type
comm_contacts_counts = (
enumerate_contacts(comm_df)
.groupby(["participant_id"])
.nunique()["contact_id"]
.reset_index()
.rename(columns={"contact_id": "no_contacts"})
)
# Number of communication contacts
comm_features = comm_counts.join(comm_duration_total)
comm_features = comm_features.join(comm_duration_max)
comm_features = comm_features.merge(
comm_contacts_counts,
on="participant_id"
).set_index("participant_id")
try:
comm_features.drop(columns="duration_total_" + call_types[3], inplace=True)
comm_features.drop(columns="duration_max_" + call_types[3], inplace=True)
@ -172,68 +185,66 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame:
no_sent_ratio=lambda x: x.no_sent / x.no_all,
)
# Ratio of incoming and outgoing messages to all messages.
comm_contacts_counts = (
enumerate_contacts(comm_df)
.groupby(["participant_id"])
.nunique()["contact_id"]
.reset_index()
.rename(columns={"contact_id": "no_contacts"})
)
# Number of communication contacts
comm_features = comm_features.merge(
comm_contacts_counts,
on="participant_id"
).set_index("participant_id")
else:
raise KeyError("The dataframe contains neither call_type or message_type")
return comm_features
def contact_features(df_enumerated: pd.DataFrame) -> pd.DataFrame:
def contact_features(comm_df: pd.DataFrame) -> pd.DataFrame:
"""
Counts the number of people contacted (for each participant) and, if
df_enumerated is a dataframe containing calls data, the total duration
of calls between a participant and each of her contacts.
For each participant and for each of his contacts, this function
counts the number of communications (by type) between them. If the
argument passed is a dataframe with calls data, it additionally counts
the total duration of calls between every pair (participant, contact).
Parameters
----------
df_enumerated: pd.DataFrame
A dataframe of calls or SMSes; return of function enumerate_contacts.
A dataframe of calls or SMSes.
Returns
-------
comm_df: pd.DataFrame
The altered dataframe with the column no_contacts and, if df_enumerated
contains calls data, an additional column total_call_duration.
A new dataframe with a row for each pair (participant, contact).
"""
df_enumerated = enumerate_contacts(comm_df)
contacts_count = (
df_enumerated
.groupby(["participant_id","contact_id"])
.size()
.reset_index()
)
# Check whether df contains calls or SMS data since some
# features we want to calculate are type-specific
if "call_duration" in df_enumerated:
# Add a column with the total duration of calls between two people
duration_count = (
df_enumerated.groupby(["participant_id", "contact_id"])
df_enumerated
.groupby(["participant_id", "contact_id"])
# For each participant and for each caller, sum durations of their calls
["call_duration"]
.sum()
.reset_index() # Make index (which is actually the participant id) a normal column
.rename(columns={"call_duration": "total_call_duration"})
)
# The new dataframe now contains columns containing information about
# participants, callers and the total duration of their calls. All that
# is now left to do is to merge the original df with the new one.
df_enumerated = df_enumerated.merge(
duration_count, on=["participant_id", "contact_id"]
)
contact_count = (
df_enumerated.groupby(["participant_id"])
.nunique()[
"contact_id"
] # For each participant, count the number of distinct contacts
.reset_index() # Make index (which is actually the participant id) a normal column
.rename(columns={"contact_id": "no_contacts"})
)
df_enumerated = (
# Merge df with the newly created df containing info about number of contacts
df_enumerated.merge(contact_count, on="participant_id")
# Sort first by participant_id and then by contact_id and
# thereby restore the initial ordering of input dataframes.
.sort_values(["participant_id", "contact_id"])
)
contacts_count = contacts_count.merge(duration_count, on=["participant_id", "contact_id"])
contacts_count.rename(columns={0:"no_calls"}, inplace=True)
else:
contacts_count.rename(columns={0:"no_sms"}, inplace=True)
# TODO: Determine work vs non-work contacts by work hours heuristics
return df_enumerated
return contacts_count
def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataFrame:
@ -245,7 +256,7 @@ def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataF
df_calls: pd.DataFrame
A dataframe of calls (return of get_call_data).
df_sms: pd.DataFrame
A dataframe of calls (return of get_sms_data).
A dataframe of SMSes (return of get_sms_data).
Returns
-------
@ -263,10 +274,8 @@ def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataF
* proportion_calls_contacts:
proportion of calls contacts in total number of communication contacts
"""
count_calls = count_comms(df_calls)
count_sms = count_comms(df_sms)
count_joined = (
count_calls.merge(
count_sms, on="participant_id", suffixes=("_calls", "_sms")
@ -284,6 +293,9 @@ def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataF
),
proportion_calls_outgoing=(
lambda x: x.no_outgoing / (x.no_outgoing + x.no_sent)
),
proportion_calls_contacts=(
lambda x: x.no_contacts_calls / (x.no_contacts_calls + x.no_contacts_sms)
)
# Calculate new features and create additional columns
)[
@ -292,33 +304,9 @@ def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataF
"proportion_calls",
"proportion_calls_incoming",
"proportion_calls_outgoing",
"proportion_calls_contacts",
"proportion_calls_missed_sms_received",
]
] # Filter out only the relevant features
)
features_calls = contact_features(enumerate_contacts(df_calls))
features_sms = contact_features(enumerate_contacts(df_sms))
features_joined = (
features_calls.merge(
features_sms, on="participant_id", suffixes=("_calls", "_sms")
) # Merge calls and sms features
.reset_index() # Make participant_id a regular column
.assign(
proportion_calls_contacts=(
lambda x: x.no_contacts_calls
/ (x.no_contacts_calls + x.no_contacts_sms)
) # Calculate new features and create additional columns
)[
["participant_id", "proportion_calls_contacts"]
] # Filter out only the relevant features
# Since we are interested only in some features and ignored
# others, a lot of duplicate rows were created. Remove them.
.drop_duplicates()
)
# Join the newly created dataframes
df_calls_sms = count_joined.merge(features_joined, on="participant_id")
return df_calls_sms
return count_joined

View File

@ -38,8 +38,10 @@ def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
# - OFF -> ON -> unlocked (a true phone unlock)
# - OFF -> ON -> OFF/locked (no unlocking, i.e. a screen status check)
# Consider that screen data is sometimes unreliable as shown in expl_screen.ipynb:
# "I have also seen off -> on -> unlocked (with 2 - locked missing)
# and off -> locked -> on -> off -> locked (*again*)."
# "I have also seen
# off -> on -> unlocked (with 2 - locked missing)
# and
# off -> locked -> on -> off -> locked (*again*)."
# Either clean the data beforehand or deal with these inconsistencies in this function.
pass