From 75b054d3580a8957b5e0d18c7ccf81049d73b8e3 Mon Sep 17 00:00:00 2001 From: Primoz Date: Tue, 17 Jan 2023 14:00:14 +0000 Subject: [PATCH 1/3] Integrate phone_speech into rapids pipeline. --- Snakefile | 3 ++- rules/features.smk | 4 ++-- src/data/streams/aware_postgresql/format.yaml | 21 +++++++++++++++++++ 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/Snakefile b/Snakefile index c591650a..b8631e4a 100644 --- a/Snakefile +++ b/Snakefile @@ -178,9 +178,10 @@ for provider in config["PHONE_SPEECH"]["PROVIDERS"].keys(): if config["PHONE_SPEECH"]["PROVIDERS"][provider]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/phone_speech_raw.csv",pid=config["PIDS"])) files_to_compute.extend(expand("data/raw/{pid}/phone_speech_with_datetime.csv",pid=config["PIDS"])) - files_to_compute.extend(expand("data/interim/{pid}/phone_speech_clean.csv",pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_speech_features/phone_speech_{language}_{provider_key}.csv",pid=config["PIDS"],language=get_script_language(config["PHONE_SPEECH"]["PROVIDERS"][provider]["SRC_SCRIPT"]),provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/phone_speech.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") # We can delete these if's as soon as we add feature PROVIDERS to any of these sensors if isinstance(config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"], dict): diff --git a/rules/features.smk b/rules/features.smk index 83d08568..e6a3b0bd 100644 --- a/rules/features.smk +++ b/rules/features.smk @@ -347,12 +347,12 @@ rule esm_features: rule phone_speech_python_features: input: - sensor_data = "data/interim/{pid}/phone_speech_with_datetime.csv", + sensor_data = "data/raw/{pid}/phone_speech_with_datetime.csv", time_segments_labels = 
"data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_SPEECH"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", - sensor_key = "phone_speech", + sensor_key = "phone_speech" output: "data/interim/{pid}/phone_speech_features/phone_speech_python_{provider_key}.csv" script: diff --git a/src/data/streams/aware_postgresql/format.yaml b/src/data/streams/aware_postgresql/format.yaml index f4cf20cd..ebdd6062 100644 --- a/src/data/streams/aware_postgresql/format.yaml +++ b/src/data/streams/aware_postgresql/format.yaml @@ -349,3 +349,24 @@ PHONE_WIFI_VISIBLE: COLUMN_MAPPINGS: SCRIPTS: # List any python or r scripts that mutate your raw data +PHONE_SPEECH: + ANDROID: + RAPIDS_COLUMN_MAPPINGS: + TIMESTAMP: timestamp + DEVICE_ID: device_id + SPEECH_PROPORTION: speech_proportion + MUTATION: + COLUMN_MAPPINGS: + SCRIPTS: # List any python or r scripts that mutate your raw data + IOS: + RAPIDS_COLUMN_MAPPINGS: + TIMESTAMP: timestamp + DEVICE_ID: device_id + SPEECH_PROPORTION: speech_proportion + MUTATION: + COLUMN_MAPPINGS: + SCRIPTS: # List any python or r scripts that mutate your raw data + + + + From 4d0497a5e0c2a14cd610bb046467c7b6131c062a Mon Sep 17 00:00:00 2001 From: Primoz Date: Tue, 17 Jan 2023 14:00:42 +0000 Subject: [PATCH 2/3] Set appropriate calculations for speech sensor. 
--- config.yaml | 3 +- src/features/phone_speech/straw/main.py | 81 +++++++------------------ 2 files changed, 23 insertions(+), 61 deletions(-) diff --git a/config.yaml b/config.yaml index cf29679c..cd6a79a2 100644 --- a/config.yaml +++ b/config.yaml @@ -248,13 +248,12 @@ PHONE_ESM: FEATURES: [mean] SRC_SCRIPT: src/features/phone_esm/straw/main.py -# Custom sensor PHONE_SPEECH: CONTAINER: speech PROVIDERS: STRAW: COMPUTE: True - FEATURES: ["countscans"] + FEATURES: ["meanspeech", "stdspeech", "nlargest", "nsmallest", "medianspeech"] SRC_SCRIPT: src/features/phone_speech/straw/main.py # See https://www.rapids.science/latest/features/phone-keyboard/ diff --git a/src/features/phone_speech/straw/main.py b/src/features/phone_speech/straw/main.py index 1ea83f91..68213dae 100644 --- a/src/features/phone_speech/straw/main.py +++ b/src/features/phone_speech/straw/main.py @@ -1,67 +1,30 @@ import pandas as pd -# TODO: adjust features files -QUESTIONNAIRE_IDS = { - "sleep_quality": 1, - "PANAS_positive_affect": 8, - "PANAS_negative_affect": 9, - "JCQ_job_demand": 10, - "JCQ_job_control": 11, - "JCQ_supervisor_support": 12, - "JCQ_coworker_support": 13, - "PFITS_supervisor": 14, - "PFITS_coworkers": 15, - "UWES_vigor": 16, - "UWES_dedication": 17, - "UWES_absorption": 18, - "COPE_active": 19, - "COPE_support": 20, - "COPE_emotions": 21, - "balance_life_work": 22, - "balance_work_life": 23, - "recovery_experience_detachment": 24, - "recovery_experience_relaxation": 25, - "symptoms": 26, - "appraisal_stressfulness_event": 87, - "appraisal_threat": 88, - "appraisal_challenge": 89, - "appraisal_event_time": 90, - "appraisal_event_duration": 91, - "appraisal_event_work_related": 92, - "appraisal_stressfulness_period": 93, - "late_work": 94, - "work_hours": 95, - "left_work": 96, - "activities": 97, - "coffee_breaks": 98, - "at_work_yet": 99, -} - def straw_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs): - esm_data = 
pd.read_csv(sensor_data_files["sensor_data"]) + speech_data = pd.read_csv(sensor_data_files["sensor_data"]) requested_features = provider["FEATURES"] - # name of the features this function can compute - requested_scales = provider["SCALES"] - base_features_names = ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support", - "appraisal_stressfulness_period", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"] - #TODO Check valid questionnaire and feature names. - # the subset of requested features this function can compute + # name of the features this function can compute+ + base_features_names = ["meanspeech", "stdspeech", "nlargest", "nsmallest", "medianspeech"] features_to_compute = list(set(requested_features) & set(base_features_names)) - esm_features = pd.DataFrame(columns=["local_segment"] + features_to_compute) - if not esm_data.empty: - esm_data = filter_data_by_segment(esm_data, time_segment) + speech_features = pd.DataFrame(columns=["local_segment"] + features_to_compute) + + if not speech_data.empty: + speech_data = filter_data_by_segment(speech_data, time_segment) - if not esm_data.empty: - esm_features = pd.DataFrame() - for scale in requested_scales: - questionnaire_id = QUESTIONNAIRE_IDS[scale] - mask = esm_data["questionnaire_id"] == questionnaire_id - esm_features[scale + "_mean"] = esm_data.loc[mask].groupby(["local_segment"])["esm_user_score"].mean() - #TODO Create the column esm_user_score in esm_clean. Currently, this is only done when reversing. 
+ if not speech_data.empty: + speech_features = pd.DataFrame() + if "meanspeech" in features_to_compute: + speech_features["meanspeech"] = speech_data.groupby(["local_segment"])['speech_proportion'].mean() + if "stdspeech" in features_to_compute: + speech_features["stdspeech"] = speech_data.groupby(["local_segment"])['speech_proportion'].std() + if "nlargest" in features_to_compute: + speech_features["nlargest"] = speech_data.groupby(["local_segment"])['speech_proportion'].apply(lambda x: x.nlargest(5).mean()) + if "nsmallest" in features_to_compute: + speech_features["nsmallest"] = speech_data.groupby(["local_segment"])['speech_proportion'].apply(lambda x: x.nsmallest(5).mean()) + if "medianspeech" in features_to_compute: + speech_features["medianspeech"] = speech_data.groupby(["local_segment"])['speech_proportion'].median() + + speech_features = speech_features.reset_index() - esm_features = esm_features.reset_index() - if 'index' in esm_features: # In calse of empty esm_features df - esm_features.rename(columns={'index': 'local_segment'}, inplace=True) - - return esm_features + return speech_features From 7e37eb906700f2fe3d8991efc7bae514ac1c4a3f Mon Sep 17 00:00:00 2001 From: Primoz Date: Mon, 23 Jan 2023 15:32:52 +0000 Subject: [PATCH 3/3] Change SPEECH sensor place in config --- config.yaml | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/config.yaml b/config.yaml index cd6a79a2..943c4ed9 100644 --- a/config.yaml +++ b/config.yaml @@ -248,14 +248,6 @@ PHONE_ESM: FEATURES: [mean] SRC_SCRIPT: src/features/phone_esm/straw/main.py -PHONE_SPEECH: - CONTAINER: speech - PROVIDERS: - STRAW: - COMPUTE: True - FEATURES: ["meanspeech", "stdspeech", "nlargest", "nsmallest", "medianspeech"] - SRC_SCRIPT: src/features/phone_speech/straw/main.py - # See https://www.rapids.science/latest/features/phone-keyboard/ PHONE_KEYBOARD: CONTAINER: keyboard @@ -337,6 +329,15 @@ PHONE_SCREEN: EPISODE_TYPES: ["unlock"] SRC_SCRIPT: 
src/features/phone_screen/rapids/main.py +# Custom added sensor +PHONE_SPEECH: + CONTAINER: speech + PROVIDERS: + STRAW: + COMPUTE: True + FEATURES: ["meanspeech", "stdspeech", "nlargest", "nsmallest", "medianspeech"] + SRC_SCRIPT: src/features/phone_speech/straw/main.py + # See https://www.rapids.science/latest/features/phone-wifi-connected/ PHONE_WIFI_CONNECTED: CONTAINER: sensor_wifi