Merge branch 'master' into ml_pipeline

rapids
junos 2021-08-21 17:37:45 +02:00
commit 0b85ee8fdc
1 changed file with 32 additions and 22 deletions

View File

@@ -166,7 +166,7 @@ def count_comms(comm_df: pd.DataFrame, group_by=None) -> pd.DataFrame:
data_type = "calls" data_type = "calls"
comm_counts = ( comm_counts = (
comm_df.value_counts(subset=group_by + ["participant_id", "call_type"]) comm_df.value_counts(subset=group_by + ["participant_id", "call_type"])
.unstack() .unstack(level="call_type", fill_value=0)
.rename(columns=call_types) .rename(columns=call_types)
.add_prefix("no_") .add_prefix("no_")
) )
@@ -181,7 +181,7 @@ def count_comms(comm_df: pd.DataFrame, group_by=None) -> pd.DataFrame:
comm_duration_total = ( comm_duration_total = (
comm_df.groupby(group_by + ["participant_id", "call_type"]) comm_df.groupby(group_by + ["participant_id", "call_type"])
.sum()["call_duration"] .sum()["call_duration"]
.unstack() .unstack(level="call_type", fill_value=0)
.rename(columns=call_types) .rename(columns=call_types)
.add_prefix("duration_total_") .add_prefix("duration_total_")
) )
@@ -189,7 +189,7 @@ def count_comms(comm_df: pd.DataFrame, group_by=None) -> pd.DataFrame:
comm_duration_max = ( comm_duration_max = (
comm_df.groupby(group_by + ["participant_id", "call_type"]) comm_df.groupby(group_by + ["participant_id", "call_type"])
.max()["call_duration"] .max()["call_duration"]
.unstack() .unstack(level="call_type", fill_value=0)
.rename(columns=call_types) .rename(columns=call_types)
.add_prefix("duration_max_") .add_prefix("duration_max_")
) )
@@ -208,7 +208,7 @@ def count_comms(comm_df: pd.DataFrame, group_by=None) -> pd.DataFrame:
data_type = "sms" data_type = "sms"
comm_counts = ( comm_counts = (
comm_df.value_counts(subset=group_by + ["participant_id", "message_type"]) comm_df.value_counts(subset=group_by + ["participant_id", "message_type"])
.unstack() .unstack(level="message_type", fill_value=0)
.rename(columns=sms_types) .rename(columns=sms_types)
.add_prefix("no_") .add_prefix("no_")
) )
@@ -311,24 +311,34 @@ def calls_sms_features(
group_by = [] group_by = []
count_calls = count_comms(df_calls, group_by) count_calls = count_comms(df_calls, group_by)
count_sms = count_comms(df_sms, group_by) count_sms = count_comms(df_sms, group_by)
count_joined = count_calls.merge( count_joined = (
count_sms, how="outer", left_index=True, right_index=True, validate="one_to_one" count_calls.merge(
).assign( count_sms,
proportion_calls_all=( how="outer",
lambda x: x.no_calls_all / (x.no_calls_all + x.no_sms_all) left_index=True,
), right_index=True,
proportion_calls_incoming=( validate="one_to_one",
lambda x: x.no_incoming / (x.no_incoming + x.no_received)
),
proportion_calls_missed_sms_received=(
lambda x: x.no_missed / (x.no_missed + x.no_received)
),
proportion_calls_outgoing=(
lambda x: x.no_outgoing / (x.no_outgoing + x.no_sent)
),
proportion_calls_contacts=(
lambda x: x.no_contacts_calls / (x.no_contacts_calls + x.no_contacts_sms)
) )
# Calculate new features and create additional columns .fillna(0, downcast="infer")
.assign(
proportion_calls_all=(
lambda x: x.no_calls_all / (x.no_calls_all + x.no_sms_all)
),
proportion_calls_incoming=(
lambda x: x.no_incoming / (x.no_incoming + x.no_received)
),
proportion_calls_missed_sms_received=(
lambda x: x.no_missed / (x.no_missed + x.no_received)
),
proportion_calls_outgoing=(
lambda x: x.no_outgoing / (x.no_outgoing + x.no_sent)
),
proportion_calls_contacts=(
lambda x: x.no_contacts_calls
/ (x.no_contacts_calls + x.no_contacts_sms)
)
# Calculate new features and create additional columns
)
.fillna(0.5, downcast="infer")
) )
return count_joined return count_joined