Merge branch 'master' into ml_pipeline

rapids
junos 2021-08-18 17:30:36 +02:00
commit de92e1309d
3 changed files with 109 additions and 105 deletions

View File

@ -8,6 +8,43 @@ from setup import db_engine, session
# Readable labels for the numeric call and SMS type codes.
call_types = {1: "incoming", 2: "outgoing", 3: "missed"}
sms_types = {1: "received", 2: "sent"}

# Column names expected from count_comms() when it is given calls data.
# Durations and ratios are only listed for incoming (1) and outgoing (2)
# calls; missed calls (3) only get a count.
FEATURES_CALLS = (
    ["no_calls_all"]
    + [f"no_{call_type}" for call_type in call_types.values()]
    + [f"duration_total_{call_types[code]}" for code in (1, 2)]
    + [f"duration_max_{call_types[code]}" for code in (1, 2)]
    + [f"no_{call_types[code]}_ratio" for code in (1, 2)]
    + ["no_contacts_calls"]
)
# FEATURES_CALLS ==
# ["no_calls_all",
#  "no_incoming", "no_outgoing", "no_missed",
#  "duration_total_incoming", "duration_total_outgoing",
#  "duration_max_incoming", "duration_max_outgoing",
#  "no_incoming_ratio", "no_outgoing_ratio",
#  "no_contacts_calls"]

# Column names expected from count_comms() when it is given SMS data.
FEATURES_SMS = (
    ["no_sms_all"]
    + [f"no_{sms_type}" for sms_type in sms_types.values()]
    + [f"no_{sms_types[code]}_ratio" for code in (1, 2)]
    + ["no_contacts_sms"]
)
# FEATURES_SMS ==
# ["no_sms_all",
#  "no_received", "no_sent",
#  "no_received_ratio", "no_sent_ratio",
#  "no_contacts_sms"]

# Columns that calls_sms_features() adds on top of the call and SMS features:
# proportions of calls within the combined (calls + SMS) communication counts.
FEATURES_CONTACT = [
    "proportion_calls_all",
    "proportion_calls_incoming",
    "proportion_calls_outgoing",
    "proportion_calls_contacts",
    "proportion_calls_missed_sms_received",
]
def get_call_data(usernames: Collection) -> pd.DataFrame:
"""
@ -114,10 +151,12 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame:
These are:
* the number of calls by type (incoming, outgoing, missed) and in total,
* the ratio of incoming and outgoing calls to the total number of calls,
* the total and maximum duration of calls by type, and
* the number of messages by type (received, sent).
* the total and maximum duration of calls by type,
* the number of messages by type (received, sent), and
* the number of communication contacts by type.
"""
if "call_type" in comm_df:
data_type = "calls"
comm_counts = (
comm_df.value_counts(subset=["participant_id", "call_type"])
.unstack()
@ -125,11 +164,11 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame:
.add_prefix("no_")
)
# Count calls by type.
comm_counts["no_all"] = comm_counts.sum(axis=1)
comm_counts["no_calls_all"] = comm_counts.sum(axis=1)
# Add a total count of calls.
comm_counts = comm_counts.assign(
no_incoming_ratio=lambda x: x.no_incoming / x.no_all,
no_outgoing_ratio=lambda x: x.no_outgoing / x.no_all,
no_incoming_ratio=lambda x: x.no_incoming / x.no_calls_all,
no_outgoing_ratio=lambda x: x.no_outgoing / x.no_calls_all,
)
# Ratio of incoming and outgoing calls to all calls.
comm_duration_total = (
@ -159,44 +198,56 @@ def count_comms(comm_df: pd.DataFrame) -> pd.DataFrame:
# If there were no missed calls, this exception is raised.
# But we are dropping the column anyway, so no need to deal with the exception.
elif "message_type" in comm_df:
data_type = "sms"
comm_counts = (
comm_df.value_counts(subset=["participant_id", "message_type"])
.unstack()
.rename(columns=sms_types)
.add_prefix("no_")
)
comm_counts["no_all"] = comm_counts.sum(axis=1)
comm_counts["no_sms_all"] = comm_counts.sum(axis=1)
# Add a total count of messages.
comm_features = comm_counts.assign(
no_received_ratio=lambda x: x.no_received / x.no_all,
no_sent_ratio=lambda x: x.no_sent / x.no_all,
no_received_ratio=lambda x: x.no_received / x.no_sms_all,
no_sent_ratio=lambda x: x.no_sent / x.no_sms_all,
)
# Ratio of received and sent messages to all messages.
else:
raise KeyError("The dataframe contains neither call_type or message_type")
comm_contacts_counts = (
enumerate_contacts(comm_df)
.groupby(["participant_id"])
.nunique()["contact_id"]
.rename("no_contacts_" + data_type)
)
# Number of communication contacts
comm_features = comm_features.join(comm_contacts_counts)
return comm_features
def contact_features(df_enumerated: pd.DataFrame) -> pd.DataFrame:
def contact_features(comm_df: pd.DataFrame) -> pd.DataFrame:
"""
Counts the number of people contacted (for each participant) and, if
df_enumerated is a dataframe containing calls data, the total duration
of calls between a participant and each of her contacts.
For each participant and for each of his contacts, this function
counts the number of communications (by type) between them. If the
argument passed is a dataframe with calls data, it additionally counts
the total duration of calls between every pair (participant, contact).
Parameters
----------
df_enumerated: pd.DataFrame
A dataframe of calls or SMSes; return of function enumerate_contacts.
comm_df: pd.DataFrame
A dataframe of calls or SMSes.
Returns
-------
comm_df: pd.DataFrame
The altered dataframe with the column no_contacts and, if df_enumerated
contains calls data, an additional column total_call_duration.
A new dataframe with a row for each pair (participant, contact).
"""
df_enumerated = enumerate_contacts(comm_df)
contacts_count = (
df_enumerated.groupby(["participant_id", "contact_id"]).size().reset_index()
)
# Check whether df contains calls or SMS data since some
# features we want to calculate are type-specyfic
# features we want to calculate are type-specific
if "call_duration" in df_enumerated:
# Add a column with the total duration of calls between two people
duration_count = (
@ -207,33 +258,14 @@ def contact_features(df_enumerated: pd.DataFrame) -> pd.DataFrame:
.reset_index() # Make index (which is actually the participant id) a normal column
.rename(columns={"call_duration": "total_call_duration"})
)
# The new dataframe now contains columns containing information about
# participants, callers and the total duration of their calls. All that
# is now left to do is to merge the original df with the new one.
df_enumerated = df_enumerated.merge(
contacts_count = contacts_count.merge(
duration_count, on=["participant_id", "contact_id"]
)
contact_count = (
df_enumerated.groupby(["participant_id"])
.nunique()[
"contact_id"
] # For each participant, count the number of distinct contacts
.reset_index() # Make index (which is actually the participant id) a normal column
.rename(columns={"contact_id": "no_contacts"})
)
df_enumerated = (
# Merge df with the newely created df containing info about number of contacts
df_enumerated.merge(contact_count, on="participant_id")
# Sort first by participant_id and then by contact_id and
# thereby restore the inital ordering of input dataframes.
.sort_values(["participant_id", "contact_id"])
)
contacts_count.rename(columns={0: "no_calls"}, inplace=True)
else:
contacts_count.rename(columns={0: "no_sms"}, inplace=True)
# TODO:Determine work vs non-work contacts by work hours heuristics
return df_enumerated
return contacts_count
def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataFrame:
@ -245,14 +277,14 @@ def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataF
df_calls: pd.DataFrame
A dataframe of calls (return of get_call_data).
df_sms: pd.DataFrame
A dataframe of calls (return of get_sms_data).
A dataframe of SMSes (return of get_sms_data).
Returns
-------
df_calls_sms: pd.DataFrame
The list of features relating calls and sms data for every participant.
These are:
* proportion_calls:
* proportion_calls_all:
proportion of calls in total number of communications
* proportion_calls_incoming:
proportion of incoming calls in total number of incoming/received communications
@ -263,62 +295,24 @@ def calls_sms_features(df_calls: pd.DataFrame, df_sms: pd.DataFrame) -> pd.DataF
* proportion_calls_contacts:
proportion of calls contacts in total number of communication contacts
"""
count_calls = count_comms(df_calls)
count_sms = count_comms(df_sms)
count_joined = (
count_calls.merge(
count_sms, on="participant_id", suffixes=("_calls", "_sms")
) # Merge calls and sms features
.reset_index() # Make participant_id a regular column
.assign(
proportion_calls=(
lambda x: x.no_all_calls / (x.no_all_calls + x.no_all_sms)
),
proportion_calls_incoming=(
lambda x: x.no_incoming / (x.no_incoming + x.no_received)
),
proportion_calls_missed_sms_received=(
lambda x: x.no_missed / (x.no_missed + x.no_received)
),
proportion_calls_outgoing=(
lambda x: x.no_outgoing / (x.no_outgoing + x.no_sent)
)
# Calculate new features and create additional columns
)[
[
"participant_id",
"proportion_calls",
"proportion_calls_incoming",
"proportion_calls_outgoing",
"proportion_calls_missed_sms_received",
]
] # Filter out only the relevant features
count_joined = count_calls.join(count_sms).assign(
proportion_calls_all=(
lambda x: x.no_calls_all / (x.no_calls_all + x.no_sms_all)
),
proportion_calls_incoming=(
lambda x: x.no_incoming / (x.no_incoming + x.no_received)
),
proportion_calls_missed_sms_received=(
lambda x: x.no_missed / (x.no_missed + x.no_received)
),
proportion_calls_outgoing=(
lambda x: x.no_outgoing / (x.no_outgoing + x.no_sent)
),
proportion_calls_contacts=(
lambda x: x.no_contacts_calls / (x.no_contacts_calls + x.no_contacts_sms)
)
# Calculate new features and create additional columns
)
features_calls = contact_features(enumerate_contacts(df_calls))
features_sms = contact_features(enumerate_contacts(df_sms))
features_joined = (
features_calls.merge(
features_sms, on="participant_id", suffixes=("_calls", "_sms")
) # Merge calls and sms features
.reset_index() # Make participant_id a regular column
.assign(
proportion_calls_contacts=(
lambda x: x.no_contacts_calls
/ (x.no_contacts_calls + x.no_contacts_sms)
) # Calculate new features and create additional columns
)[
["participant_id", "proportion_calls_contacts"]
] # Filter out only the relevant features
# Since we are interested only in some features and ignored
# others, a lot of duplicate rows were created. Remove them.
.drop_duplicates()
)
# Join the newly created dataframes
df_calls_sms = count_joined.merge(features_joined, on="participant_id")
return df_calls_sms
return count_joined

View File

@ -38,8 +38,10 @@ def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
# - OFF -> ON -> unlocked (a true phone unlock)
# - OFF -> ON -> OFF/locked (no unlocking, i.e. a screen status check)
# Consider that screen data is sometimes unreliable as shown in expl_screen.ipynb:
# "I have also seen off -> on -> unlocked (with 2 - locked missing)
# and off -> locked -> on -> off -> locked (*again*)."
# "I have also seen
# off -> on -> unlocked (with 2 - locked missing)
# and
# off -> locked -> on -> off -> locked (*again*)."
# Either clean the data beforehand or deal with these inconsistencies in this function.
pass

View File

@ -5,7 +5,7 @@ import pandas as pd
from numpy.random import default_rng
from pandas.testing import assert_series_equal
from features.communication import count_comms, enumerate_contacts, get_call_data
from features.communication import *
rng = default_rng()
@ -76,10 +76,18 @@ class CallsFeatures(unittest.TestCase):
def test_count_comms_calls(self):
    """count_comms on calls data returns a DataFrame whose columns are exactly FEATURES_CALLS."""
    self.features = count_comms(self.calls)
    self.assertIsInstance(self.features, pd.DataFrame)
    # assertCountEqual ignores column order but requires the same set of names.
    self.assertCountEqual(self.features.columns.to_list(), FEATURES_CALLS)
def test_count_comms_sms(self):
    """count_comms on SMS data returns a DataFrame whose columns are exactly FEATURES_SMS."""
    self.features = count_comms(self.sms)
    self.assertIsInstance(self.features, pd.DataFrame)
    # assertCountEqual ignores column order but requires the same set of names.
    self.assertCountEqual(self.features.columns.to_list(), FEATURES_SMS)
def test_calls_sms_features(self):
    """calls_sms_features returns a DataFrame with the call, SMS and joint feature columns."""
    self.features_call_sms = calls_sms_features(self.calls, self.sms)
    self.assertIsInstance(self.features_call_sms, pd.DataFrame)

    # Order is irrelevant; only the set of column names must match.
    expected_columns = FEATURES_CALLS + FEATURES_SMS + FEATURES_CONTACT
    actual_columns = self.features_call_sms.columns.to_list()
    self.assertCountEqual(actual_columns, expected_columns)