From 1409de28322f459cb19b20d313d7266429c63517 Mon Sep 17 00:00:00 2001 From: nikunjgoel95 Date: Fri, 19 Jun 2020 01:27:28 -0400 Subject: [PATCH] Added Converstion Features. Updated config.yaml, Snakefile, Features.Snakefile and documentation. --- Snakefile | 3 + config.yaml | 12 +- docs/features/extracted.rst | 72 ++++++++++++ rules/features.snakefile | 22 ++++ .../conversation/conversation_base.py | 104 ++++++++++++++++++ src/features/conversation_features.py | 15 +++ 6 files changed, 227 insertions(+), 1 deletion(-) create mode 100644 src/features/conversation/conversation_base.py create mode 100644 src/features/conversation_features.py diff --git a/Snakefile b/Snakefile index 633ca3a2..92abe82f 100644 --- a/Snakefile +++ b/Snakefile @@ -58,6 +58,9 @@ rule all: expand("data/processed/{pid}/light_{day_segment}.csv", pid = config["PIDS"], day_segment = config["LIGHT"]["DAY_SEGMENTS"]), + expand("data/processed/{pid}/conversation_{day_segment}.csv", + pid = config["PIDS"], + day_segment = config["CONVERSATION"]["DAY_SEGMENTS"]), expand("data/processed/{pid}/accelerometer_{day_segment}.csv", pid = config["PIDS"], day_segment = config["ACCELEROMETER"]["DAY_SEGMENTS"]), diff --git a/config.yaml b/config.yaml index a5e5cdf6..f731b304 100644 --- a/config.yaml +++ b/config.yaml @@ -1,5 +1,5 @@ # Valid database table names -SENSORS: [applications_crashes, applications_foreground, applications_notifications, battery, bluetooth, calls, locations, messages, plugin_ambient_noise, plugin_device_usage, plugin_google_activity_recognition, plugin_ios_activity_recognition, screen] +SENSORS: [applications_crashes, applications_foreground, applications_notifications, battery, bluetooth, calls, locations, messages, plugin_ambient_noise, plugin_device_usage, plugin_google_activity_recognition, plugin_ios_activity_recognition, screen,plugin_studentlife_audio] FITBIT_TABLE: [fitbit_data] FITBIT_SENSORS: [heartrate, steps, sleep, calories] @@ -138,6 +138,16 @@ WIFI: DAY_SEGMENTS: 
*day_segments FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"] +CONVERSATION: + DAY_SEGMENTS: *day_segments + FEATURES: ["minutessilence", "minutesnoise", "minutesvoice", "minutesunknown","sumconversationduration","avgconversationduration", + "sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","sumenergy", + "avgenergy","sdenergy","minenergy","maxenergy","silencesensedfraction","noisesensedfraction", + "voicesensedfraction","unknownsensedfraction","silenceexpectedfraction","noiseexpectedfraction","voiceexpectedfraction", + "unknownexpectedfraction"] + RECORDINGMINUTES: 1 + PAUSEDMINUTES : 3 + PARAMS_FOR_ANALYSIS: GROUNDTRUTH_TABLE: participant_info SOURCES: &sources ["phone_features", "fitbit_features", "phone_fitbit_features"] diff --git a/docs/features/extracted.rst b/docs/features/extracted.rst index 8d03553d..817d1a7d 100644 --- a/docs/features/extracted.rst +++ b/docs/features/extracted.rst @@ -688,6 +688,78 @@ firstuseafter minutes Seconds until the first unlock e An ``unlock`` episode is considered as the time between an ``unlock`` event and a ``lock`` event. iOS recorded these episodes reliably (albeit some duplicated ``lock`` events within milliseconds from each other). However, in Android there are some events unrelated to the screen state because of multiple consecutive ``unlock``/``lock`` events, so we keep the closest pair. In our experiments these cases are less than 10% of the screen events collected. This happens because ``ACTION_SCREEN_OFF`` and ``ON`` are "sent when the device becomes non-interactive which may have nothing to do with the screen turning off". Additionally, in Android it is possible to measure the time spent on the ``lock`` screen before an ``unlock`` event as well as the total screen time (i.e. ``ON`` to ``OFF``) but we are only keeping ``unlock`` episodes (``unlock`` to ``OFF``) to be consistent with iOS. +.. 
_conversation-sensor-doc:
+
+Conversation
+""""""""""""
+
+See `Conversation Config Code`_
+
+**Available Epochs (day_segment) :** daily, morning, afternoon, evening, night
+
+**Available Platforms:** Android and iOS
+
+**Snakefile entry to compute these features:**
+
+    | ``expand("data/processed/{pid}/conversation_{day_segment}.csv",``
+    | ``pid = config["PIDS"],``
+    | ``day_segment = config["CONVERSATION"]["DAY_SEGMENTS"]),``
+
+**Snakemake rule chain:**
+
+- Rule ``rules/preprocessing.snakefile/download_dataset``
+- Rule ``rules/preprocessing.snakefile/readable_datetime``
+- Rule ``rules/features.snakefile/conversation_features``
+
+.. _conversation-parameters:
+
+**Conversation Rule Parameters (conversation_features):**
+
+========================= ===================
+Name                      Description
+========================= ===================
+day_segment               The particular ``day_segments`` that will be analyzed. The available options are ``daily``, ``morning``, ``afternoon``, ``evening``, ``night``
+recordingMinutes          Minutes the audio plugin records in each record/pause duty cycle (default configuration is 1 min recording / 3 min pause)
+features                  Features to be computed, see table below
+pausedMinutes             Minutes the audio plugin pauses between recordings in each duty cycle (default configuration is 1 min recording / 3 min pause)
+========================= ===================
+
+.. _conversation-available-features:
+
+**Available Conversation Features**
+
+========================= ================= =============
+Name                      Units             Description
+========================= ================= =============
+minutessilence            minutes           Total duration of all minutes silence.
+minutesnoise              minutes           Total duration of all minutes noise.
+minutesvoice              minutes           Total duration of all minutes voice.
+minutesunknown            minutes           Total duration of all minutes unknown.
+sumconversationduration   minutes           Total duration of all the conversation.
+maxconversationduration   minutes           Longest duration of all the conversation.
+minconversationduration   minutes           Shortest duration of all the conversation.
+avgconversationduration   minutes           Average duration of all the conversation.
+sdconversationduration    minutes           Standard Deviation duration of all the conversation.
+timefirstconversation     minutes           Starting time of first conversation of the Day/Epoch.
+timelastconversation      minutes           Starting time of last conversation of the Day/Epoch.
+sumenergy                 L2-norm           Total sum of all the energy.
+avgenergy                 L2-norm           Average of all the energy.
+sdenergy                  L2-norm           Standard Deviation of all the energy.
+minenergy                 L2-norm           Minimum of all the energy.
+maxenergy                 L2-norm           Maximum of all the energy.
+silencesensedfraction     fraction          Minutes classified as silence divided by all sensed minutes of the Day/Epoch.
+noisesensedfraction       fraction          Minutes classified as noise divided by all sensed minutes of the Day/Epoch.
+voicesensedfraction       fraction          Minutes classified as voice divided by all sensed minutes of the Day/Epoch.
+unknownsensedfraction     fraction          Minutes classified as unknown divided by all sensed minutes of the Day/Epoch.
+silenceexpectedfraction   fraction          Minutes classified as silence divided by the minutes expected to be sensed (1440 / (recordingMinutes + pausedMinutes)).
+noiseexpectedfraction     fraction          Minutes classified as noise divided by the minutes expected to be sensed.
+voiceexpectedfraction     fraction          Minutes classified as voice divided by the minutes expected to be sensed.
+unknownexpectedfraction   fraction          Minutes classified as unknown divided by the minutes expected to be sensed.
+========================= ================= =============
+
+**Assumptions/Observations:**
+
+
 .. ------------------------------- Begin Fitbit Section -----------------------------------
 ..
 ..
def optional_conversation_input(wildcards):
    """Pick the raw conversation CSV that matches the participant's platform.

    The participant file at ``data/external/<pid>`` carries the platform name
    on its second line. Android and iOS store the StudentLife audio plugin
    data under different table names, so the raw input path differs: the
    "android" platform selects the android table, anything else falls back to
    the iOS table.
    """
    participant_file = "data/external/" + wildcards.pid
    with open(participant_file, encoding="ISO-8859-1") as external_file:
        lines = external_file.readlines()
    platform = lines[1].strip()

    if platform == "android":
        return ["data/raw/{pid}/plugin_studentlife_audio_android_with_datetime.csv"]
    return ["data/raw/{pid}/plugin_studentlife_audio_with_datetime.csv"]
def base_conversation_features(conversation_data, day_segment, requested_features, recordingMinutes, pausedMinutes, expectedMinutes):
    """Compute per-day conversation features from StudentLife audio inferences.

    Parameters
    ----------
    conversation_data : pandas.DataFrame
        Minute-level audio rows with columns ``local_date``,
        ``local_day_segment``, ``inference`` (0 = silence, 1 = noise,
        2 = voice, 3 = unknown), ``double_convo_start``,
        ``double_convo_end`` and ``double_energy``.
    day_segment : str
        "daily" uses every row; any other value first filters rows by
        ``local_day_segment``.
    requested_features : list of str
        Requested feature names; silently intersected with the names this
        function supports.
    recordingMinutes, pausedMinutes : int
        Duty-cycle configuration. Unused here (the caller folds them into
        ``expectedMinutes``) but kept for interface compatibility.
    expectedMinutes : float
        Expected number of sensed minutes per day given the duty cycle;
        denominator of the ``*expectedfraction`` features.

    Returns
    -------
    pandas.DataFrame
        One row per ``local_date``, one column per computed feature, each
        named ``conversation_<day_segment>_<feature>``.
    """
    # Names of the features this function can compute.
    base_features_names = ["minutessilence", "minutesnoise", "minutesvoice", "minutesunknown",
                           "sumconversationduration", "avgconversationduration", "sdconversationduration",
                           "minconversationduration", "maxconversationduration", "timefirstconversation",
                           "timelastconversation", "sumenergy", "avgenergy", "sdenergy", "minenergy",
                           "maxenergy", "silencesensedfraction", "noisesensedfraction",
                           "voicesensedfraction", "unknownsensedfraction", "silenceexpectedfraction",
                           "noiseexpectedfraction", "voiceexpectedfraction", "unknownexpectedfraction"]

    # The subset of requested features this function can compute.
    features_to_compute = list(set(requested_features) & set(base_features_names))
    empty_result = pd.DataFrame(columns=["local_date"] + ["conversation_" + day_segment + "_" + x for x in features_to_compute])

    if conversation_data.empty:
        return empty_result

    if day_segment != "daily":
        conversation_data = conversation_data[conversation_data["local_day_segment"] == day_segment]
    if conversation_data.empty:
        return empty_result

    # Work on a copy so the derived column does not mutate the caller's frame.
    conversation_data = conversation_data.copy()
    # Episode duration; units follow double_convo_start/end (raw plugin timestamps).
    conversation_data["conv_Dur"] = conversation_data["double_convo_end"] - conversation_data["double_convo_start"]

    prefix = "conversation_" + day_segment + "_"
    grouped = conversation_data.groupby(["local_date"])

    def _minutes_of(inference_code):
        # Per-date count of minute rows classified with the given inference code.
        matching = conversation_data[conversation_data["inference"] == inference_code]
        return matching.groupby(["local_date"])["inference"].count()

    # BUG FIX: the denominator of the *sensedfraction features must be a
    # per-date Series keyed by local_date. Previously it was assigned into a
    # row-indexed column of conversation_data (index misalignment -> all NaN)
    # and was built by summing the four per-class groupby counts (NaN whenever
    # a class was absent on a date). Counting every sensed row per date is
    # equivalent and robust.
    total_sensed_minutes = grouped["inference"].count()

    conversation_features = pd.DataFrame()

    # Per-class features: raw minute counts plus sensed/expected fractions.
    for label, code in (("silence", 0), ("noise", 1), ("voice", 2), ("unknown", 3)):
        if "minutes" + label in features_to_compute:
            conversation_features[prefix + "minutes" + label] = _minutes_of(code)
        if label + "sensedfraction" in features_to_compute:
            conversation_features[prefix + label + "sensedfraction"] = _minutes_of(code) / total_sensed_minutes
        if label + "expectedfraction" in features_to_compute:
            conversation_features[prefix + label + "expectedfraction"] = _minutes_of(code) / expectedMinutes

    # Plain per-date aggregations over episode duration and audio energy.
    aggregations = (("sumconversationduration", "conv_Dur", "sum"),
                    ("avgconversationduration", "conv_Dur", "mean"),
                    ("sdconversationduration", "conv_Dur", "std"),
                    ("minconversationduration", "conv_Dur", "min"),
                    ("maxconversationduration", "conv_Dur", "max"),
                    ("sumenergy", "double_energy", "sum"),
                    ("avgenergy", "double_energy", "mean"),
                    ("sdenergy", "double_energy", "std"),
                    ("minenergy", "double_energy", "min"),
                    ("maxenergy", "double_energy", "max"))
    for feature, column, how in aggregations:
        if feature in features_to_compute:
            conversation_features[prefix + feature] = grouped[column].agg(how)

    if "timefirstconversation" in features_to_compute:
        # Rows with double_convo_start == 0 are excluded — presumably they mark
        # "no conversation". NOTE(review): timelastconversation below does not
        # apply the same filter (behavior kept as-is); confirm whether that
        # asymmetry is intentional.
        started = conversation_data[conversation_data["double_convo_start"] > 0]
        conversation_features[prefix + "timefirstconversation"] = started.groupby(["local_date"])["double_convo_start"].min()
    if "timelastconversation" in features_to_compute:
        conversation_features[prefix + "timelastconversation"] = grouped["double_convo_start"].max()

    return conversation_features.reset_index()
import pandas as pd
from conversation.conversation_base import base_conversation_features

# Snakemake script entry point: extract conversation features for one
# participant and one day segment, writing one CSV row per local date.
audio_data = pd.read_csv(snakemake.input[0], parse_dates=["local_date_time", "local_date"])

segment = snakemake.params["day_segment"]
wanted_features = snakemake.params["features"]
recording_minutes = snakemake.params["recordingMinutes"]
paused_minutes = snakemake.params["pausedMinutes"]
# Expected sensed minutes per day given the plugin's record/pause duty cycle.
expected_minutes = 1440 / (recording_minutes + paused_minutes)

computed = base_conversation_features(audio_data, segment, wanted_features,
                                      recording_minutes, paused_minutes, expected_minutes)
# Seed with a bare local_date column so the output always has it, then
# outer-merge so dates with no data are still representable.
conversation_features = pd.DataFrame(columns=["local_date"]).merge(computed, on="local_date", how="outer")

# Sanity check: one column per requested feature plus the local_date key.
assert len(wanted_features) + 1 == conversation_features.shape[1], \
    "The number of features in the output dataframe (=" + str(conversation_features.shape[1]) + \
    ") does not match the expected value (=" + str(len(wanted_features)) + \
    " + 1). Verify your conversation feature extraction functions"

conversation_features.to_csv(snakemake.output[0], index=False)