# rapids/src/features/phone_speech/straw/main.py

import pandas as pd


def straw_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    """Compute per-segment summary statistics of the ``speech_proportion`` column.

    Args:
        sensor_data_files: Mapping whose ``"sensor_data"`` entry is handed to
            ``pandas.read_csv``.
        time_segment: Passed through unchanged to ``filter_data_by_segment``.
        provider: Mapping whose ``"FEATURES"`` entry lists the requested
            feature names. Names this function cannot compute are ignored.
        filter_data_by_segment: Callable invoked as
            ``filter_data_by_segment(data, time_segment)``; the frame it
            returns is grouped by its ``local_segment`` column.
        *args: Accepted for interface compatibility; unused.
        **kwargs: Accepted for interface compatibility; unused.

    Returns:
        A DataFrame whose first column is ``local_segment``, followed by the
        requested features in the order meanspeech, stdspeech, nlargest,
        nsmallest, medianspeech. It has one row per segment, or zero rows
        when there is no data before or after filtering.
    """
    speech_data = pd.read_csv(sensor_data_files["sensor_data"])
    requested_features = set(provider["FEATURES"])
    # Names of the features this function can compute, in output column order.
    base_features_names = ["meanspeech", "stdspeech", "nlargest", "nsmallest", "medianspeech"]
    # Iterate the canonical list instead of intersecting two sets: set ordering
    # depends on string hashing, which made the column order vary between runs.
    features_to_compute = [name for name in base_features_names if name in requested_features]
    speech_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)

    if speech_data.empty:
        return speech_features

    speech_data = filter_data_by_segment(speech_data, time_segment)
    if speech_data.empty:
        return speech_features

    # Group once and reuse the grouping for every requested statistic.
    proportion_by_segment = speech_data.groupby(["local_segment"])["speech_proportion"]
    aggregations = {
        "meanspeech": lambda grouped: grouped.mean(),
        "stdspeech": lambda grouped: grouped.std(),
        # Mean of the (up to) five largest / smallest values in each segment.
        "nlargest": lambda grouped: grouped.apply(lambda x: x.nlargest(5).mean()),
        "nsmallest": lambda grouped: grouped.apply(lambda x: x.nsmallest(5).mean()),
        "medianspeech": lambda grouped: grouped.median(),
    }
    computed = {name: aggregations[name](proportion_by_segment) for name in features_to_compute}

    # Anchor the frame on the segment index so that "local_segment" is present
    # even when none of the requested features is supported.
    segment_index = proportion_by_segment.size().index
    speech_features = pd.DataFrame(computed, index=segment_index)

    return speech_features.reset_index()