diff --git a/Snakefile b/Snakefile
index 44e9b5bf..b8631e4a 100644
--- a/Snakefile
+++ b/Snakefile
@@ -174,6 +174,15 @@ for provider in config["PHONE_ESM"]["PROVIDERS"].keys():
         # files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv",pid=config["PIDS"]))
         # files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
 
+for provider in config["PHONE_SPEECH"]["PROVIDERS"].keys():
+    if config["PHONE_SPEECH"]["PROVIDERS"][provider]["COMPUTE"]:
+        files_to_compute.extend(expand("data/raw/{pid}/phone_speech_raw.csv",pid=config["PIDS"]))
+        files_to_compute.extend(expand("data/raw/{pid}/phone_speech_with_datetime.csv",pid=config["PIDS"]))
+        files_to_compute.extend(expand("data/interim/{pid}/phone_speech_features/phone_speech_{language}_{provider_key}.csv",pid=config["PIDS"],language=get_script_language(config["PHONE_SPEECH"]["PROVIDERS"][provider]["SRC_SCRIPT"]),provider_key=provider.lower()))
+        files_to_compute.extend(expand("data/processed/features/{pid}/phone_speech.csv", pid=config["PIDS"]))
+        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
+        files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
+
 # We can delete these if's as soon as we add feature PROVIDERS to any of these sensors
 if isinstance(config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"], dict):
     for provider in config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"].keys():
diff --git a/config.yaml b/config.yaml
index 7a82190a..1c106e30 100644
--- a/config.yaml
+++ b/config.yaml
@@ -329,6 +329,15 @@ PHONE_SCREEN:
       EPISODE_TYPES: ["unlock"]
       SRC_SCRIPT: src/features/phone_screen/rapids/main.py
 
+# Custom added sensor
+PHONE_SPEECH:
+  CONTAINER: speech
+  PROVIDERS:
+    STRAW:
+      COMPUTE: True
+      FEATURES: ["meanspeech", "stdspeech", "nlargest", "nsmallest", "medianspeech"]
+      SRC_SCRIPT: src/features/phone_speech/straw/main.py
+
 # See https://www.rapids.science/latest/features/phone-wifi-connected/
 PHONE_WIFI_CONNECTED:
   CONTAINER: sensor_wifi
diff --git a/rules/features.smk b/rules/features.smk
index 2638a8f3..e6a3b0bd 100644
--- a/rules/features.smk
+++ b/rules/features.smk
@@ -345,6 +345,19 @@ rule esm_features:
     script:
         "../src/features/entry.py"
 
+rule phone_speech_python_features:
+    input:
+        sensor_data = "data/raw/{pid}/phone_speech_with_datetime.csv",
+        time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
+    params:
+        provider = lambda wildcards: config["PHONE_SPEECH"]["PROVIDERS"][wildcards.provider_key.upper()],
+        provider_key = "{provider_key}",
+        sensor_key = "phone_speech"
+    output:
+        "data/interim/{pid}/phone_speech_features/phone_speech_python_{provider_key}.csv"
+    script:
+        "../src/features/entry.py"
+
 rule phone_keyboard_python_features:
     input:
         sensor_data = "data/raw/{pid}/phone_keyboard_with_datetime.csv",
diff --git a/src/data/streams/aware_postgresql/format.yaml b/src/data/streams/aware_postgresql/format.yaml
index f4cf20cd..ebdd6062 100644
--- a/src/data/streams/aware_postgresql/format.yaml
+++ b/src/data/streams/aware_postgresql/format.yaml
@@ -349,3 +349,24 @@ PHONE_WIFI_VISIBLE:
       COLUMN_MAPPINGS:
       SCRIPTS: # List any python or r scripts that mutate your raw data
 
+PHONE_SPEECH:
+  ANDROID:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      SPEECH_PROPORTION: speech_proportion
+    MUTATION:
+      COLUMN_MAPPINGS:
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+  IOS:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      SPEECH_PROPORTION: speech_proportion
+    MUTATION:
+      COLUMN_MAPPINGS:
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+
+
+
+
diff --git a/src/features/phone_speech/straw/main.py b/src/features/phone_speech/straw/main.py
new file mode 100644
index 00000000..68213dae
--- /dev/null
+++ b/src/features/phone_speech/straw/main.py
@@ -0,0 +1,30 @@
+import pandas as pd
+
+
+def straw_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
+    speech_data = pd.read_csv(sensor_data_files["sensor_data"])
+    requested_features = provider["FEATURES"]
+    # name of the features this function can compute
+    base_features_names = ["meanspeech", "stdspeech", "nlargest", "nsmallest", "medianspeech"]
+    features_to_compute = list(set(requested_features) & set(base_features_names))
+    speech_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
+
+    if not speech_data.empty:
+        speech_data = filter_data_by_segment(speech_data, time_segment)
+
+        if not speech_data.empty:
+            speech_features = pd.DataFrame()
+            if "meanspeech" in features_to_compute:
+                speech_features["meanspeech"] = speech_data.groupby(["local_segment"])['speech_proportion'].mean()
+            if "stdspeech" in features_to_compute:
+                speech_features["stdspeech"] = speech_data.groupby(["local_segment"])['speech_proportion'].std()
+            if "nlargest" in features_to_compute:
+                speech_features["nlargest"] = speech_data.groupby(["local_segment"])['speech_proportion'].apply(lambda x: x.nlargest(5).mean())
+            if "nsmallest" in features_to_compute:
+                speech_features["nsmallest"] = speech_data.groupby(["local_segment"])['speech_proportion'].apply(lambda x: x.nsmallest(5).mean())
+            if "medianspeech" in features_to_compute:
+                speech_features["medianspeech"] = speech_data.groupby(["local_segment"])['speech_proportion'].median()
+
+            speech_features = speech_features.reset_index()
+
+    return speech_features
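
As a quick way to sanity-check the new provider outside Snakemake, the sketch below calls straw_features directly on a toy CSV. The fake_filter_data_by_segment stub, the toy data, and the hard-coded segment label are hypothetical stand-ins for the helpers that src/features/entry.py normally injects; it assumes the diff above is applied and the script is run from the repository root so the new module resolves as a namespace package.

# smoke_test_phone_speech.py -- illustrative only, not part of this changeset
import os
import tempfile

import pandas as pd

# Assumes execution from the repo root; resolves the file added by this diff.
from src.features.phone_speech.straw.main import straw_features


def fake_filter_data_by_segment(data, time_segment):
    # Stand-in for RAPIDS' real filter_data_by_segment: tag every row with one
    # made-up local_segment label so the groupby in straw_features has a key.
    data = data.copy()
    data["local_segment"] = time_segment + "#2024-01-01 00:00:00,2024-01-01 23:59:59"
    return data


# Toy speech_proportion readings for a single device (column names follow
# the RAPIDS_COLUMN_MAPPINGS introduced in format.yaml above).
toy = pd.DataFrame({
    "timestamp": [1704067200000 + i * 60000 for i in range(6)],
    "device_id": ["test-device"] * 6,
    "speech_proportion": [0.1, 0.4, 0.35, 0.9, 0.0, 0.6],
})

with tempfile.TemporaryDirectory() as tmp:
    csv_path = os.path.join(tmp, "phone_speech_with_datetime.csv")
    toy.to_csv(csv_path, index=False)

    provider = {"FEATURES": ["meanspeech", "stdspeech", "nlargest", "nsmallest", "medianspeech"]}
    features = straw_features(
        sensor_data_files={"sensor_data": csv_path},
        time_segment="daily",
        provider=provider,
        filter_data_by_segment=fake_filter_data_by_segment,
    )
    # Expect one row per local_segment with the five requested feature columns.
    print(features)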