2020-06-19 07:27:28 +02:00
|
|
|
import pandas as pd
|
|
|
|
|
|
|
|
def base_conversation_features(conversation_data, day_segment, requested_features, recordingMinutes, pausedMinutes, expectedMinutes):
    """Compute per-day conversation features from conversation-plugin rows.

    Parameters
    ----------
    conversation_data : pd.DataFrame
        One row per sensed second. Columns read here: local_date, local_time,
        local_day_segment, inference, double_convo_start, double_convo_end,
        double_energy.
    day_segment : str
        "daily" keeps every row; any other value filters rows to the matching
        local_day_segment.
    requested_features : list of str
        Requested feature names; only the subset this function supports is
        computed.
    recordingMinutes, pausedMinutes
        Unused in this function; kept so the signature stays compatible with
        callers (presumably consumed by a sibling feature function -- TODO
        confirm).
    expectedMinutes
        Denominator for the *expectedfraction features (scalar or a Series
        indexed by local_date -- confirm with caller).

    Returns
    -------
    pd.DataFrame
        One row per local_date with one "conversation_<day_segment>_<feature>"
        column per computed feature; an empty frame (with the expected
        columns) when there is no usable data.
    """
    # Inference codes used throughout: 0 = silence, 1 = noise, 2 = voice, 3 = unknown.
    inference_codes = {"silence": 0, "noise": 1, "voice": 2, "unknown": 3}

    # Names of the features this function can compute.
    base_features_names = ["minutessilence", "minutesnoise", "minutesvoice", "minutesunknown",
                           "sumconversationduration", "avgconversationduration",
                           "sdconversationduration", "minconversationduration", "maxconversationduration",
                           "timefirstconversation", "timelastconversation", "noisesumenergy",
                           "noiseavgenergy", "noisesdenergy", "noiseminenergy", "noisemaxenergy",
                           "voicesumenergy", "voiceavgenergy", "voicesdenergy", "voiceminenergy",
                           "voicemaxenergy", "silencesensedfraction", "noisesensedfraction",
                           "voicesensedfraction", "unknownsensedfraction", "silenceexpectedfraction",
                           "noiseexpectedfraction", "voiceexpectedfraction",
                           "unknownexpectedfraction", "countconversation"]

    # The subset of requested features this function can compute.
    features_to_compute = list(set(requested_features) & set(base_features_names))

    def colname(feature):
        # Output column name for one feature.
        return "conversation_" + day_segment + "_" + feature

    # Default result: empty frame with the expected columns (returned when
    # there is no data, or none left after the day-segment filter).
    conversation_features = pd.DataFrame(columns=["local_date"] + [colname(x) for x in features_to_compute])

    if not conversation_data.empty:
        if day_segment != "daily":
            conversation_data = conversation_data[conversation_data["local_day_segment"] == day_segment]

        if not conversation_data.empty:
            conversation_features = pd.DataFrame()

            # The plugin can log the same second more than once; keep one row per second.
            conversation_data = conversation_data.drop_duplicates(subset=['local_date', 'local_time'], keep="first")

            # Minutes per inference class and day: one row == one sensed second,
            # so count/60 converts seconds to minutes. Computed once and reused
            # by the minutes* and *fraction features below.
            minutes_per_label = {label: conversation_data[conversation_data['inference'] == code].groupby(["local_date"])['inference'].count() / 60
                                 for label, code in inference_codes.items()}

            for label in inference_codes:
                if "minutes" + label in features_to_compute:
                    conversation_features[colname("minutes" + label)] = minutes_per_label[label]

            if "countconversation" in features_to_compute:
                # Every conversation has a distinct positive start timestamp.
                conversation_features[colname("countconversation")] = conversation_data[conversation_data["double_convo_start"] > 0].groupby(["local_date"])['double_convo_start'].nunique()

            # Conversation duration in minutes; start/end are epoch milliseconds.
            conv_duration = (conversation_data['double_convo_end'] / 1000 - conversation_data['double_convo_start'] / 1000) / 60
            conversation_data = conversation_data.assign(conv_duration=conv_duration.values)

            # Total sensed minutes per day with a valid inference code (0..3);
            # denominator of the *sensedfraction features.
            conv_totalDuration = conversation_data[(conversation_data['inference'] >= 0) & (conversation_data['inference'] < 4)].groupby(["local_date"])['inference'].count() / 60

            for label in inference_codes:
                if label + "sensedfraction" in features_to_compute:
                    conversation_features[colname(label + "sensedfraction")] = minutes_per_label[label] / conv_totalDuration

            for label in inference_codes:
                if label + "expectedfraction" in features_to_compute:
                    conversation_features[colname(label + "expectedfraction")] = minutes_per_label[label] / expectedMinutes

            # Rows that belong to a detected conversation (positive duration),
            # grouped by day; used for the duration statistics.
            in_conversation = conversation_data[conversation_data["conv_duration"] > 0].groupby(["local_date"])["conv_duration"]

            if "sumconversationduration" in features_to_compute:
                # Sum over all rows: zero-duration rows contribute nothing.
                conversation_features[colname("sumconversationduration")] = conversation_data.groupby(["local_date"])["conv_duration"].sum()

            if "avgconversationduration" in features_to_compute:
                conversation_features[colname("avgconversationduration")] = in_conversation.mean()

            if "sdconversationduration" in features_to_compute:
                conversation_features[colname("sdconversationduration")] = in_conversation.std()

            if "minconversationduration" in features_to_compute:
                conversation_features[colname("minconversationduration")] = in_conversation.min()

            if "maxconversationduration" in features_to_compute:
                # FIX: restrict to positive durations like avg/sd/min. The old
                # code took the max over every row, so days without any
                # conversation reported 0 here while the other duration stats
                # reported NaN for the same days.
                conversation_features[colname("maxconversationduration")] = in_conversation.max()

            def minutes_since_midnight(local_time):
                # local_time is "HH:MM[:SS]"; seconds are ignored, matching the
                # original minute-resolution encoding.
                hour, minute = local_time.split(':')[:2]
                return int(hour) * 60 + int(minute)

            for feature, agg in (("timefirstconversation", "min"), ("timelastconversation", "max")):
                if feature in features_to_compute:
                    conv_times = conversation_data[conversation_data["double_convo_start"] > 0].groupby(["local_date"])['local_time'].agg(agg)
                    if len(conv_times.index) > 0:
                        for date in conv_times.index:
                            conversation_features.loc[date, colname(feature)] = minutes_since_midnight(conv_times.loc[date])
                    else:
                        # No conversations at all: keep the original 0 convention.
                        conversation_features[colname(feature)] = 0

            # Energy statistics of noise and voice samples (double_energy).
            stat_by_name = {"sum": "sum", "avg": "mean", "sd": "std", "min": "min", "max": "max"}
            for label in ("noise", "voice"):
                energy = conversation_data[conversation_data['inference'] == inference_codes[label]].groupby(["local_date"])["double_energy"]
                for name, stat in stat_by_name.items():
                    feature = label + name + "energy"
                    if feature in features_to_compute:
                        conversation_features[colname(feature)] = energy.agg(stat)

            # Promote the local_date index (set by the groupby assignments)
            # into a regular column, matching the empty-result shape.
            conversation_features = conversation_features.reset_index()

    return conversation_features
|