import pandas as pd


def straw_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    """Compute per-segment summary statistics of the ``speech_proportion`` column.

    Args:
        sensor_data_files: Mapping whose ``"sensor_data"`` entry is passed to
            ``pd.read_csv`` (a path or file-like object).
        time_segment: Passed through unchanged to ``filter_data_by_segment``.
        provider: Mapping whose ``"FEATURES"`` entry lists the requested
            feature names. Names this function does not support are ignored.
        filter_data_by_segment: Callable invoked as
            ``filter_data_by_segment(speech_data, time_segment)``. The frame it
            returns must contain ``local_segment`` and ``speech_proportion``
            columns.
        *args: Accepted for interface compatibility; unused.
        **kwargs: Accepted for interface compatibility; unused.

    Returns:
        A DataFrame with a ``local_segment`` column followed by one column per
        requested-and-supported feature, always in the order ``meanspeech``,
        ``stdspeech``, ``nlargest``, ``nsmallest``, ``medianspeech``. The frame
        has no rows when there is no data (before or after filtering) or when
        none of the requested features is supported.
    """
    speech_data = pd.read_csv(sensor_data_files["sensor_data"])

    # Features this function knows how to compute, in output-column order.
    supported_features = ["meanspeech", "stdspeech", "nlargest", "nsmallest", "medianspeech"]
    requested = set(provider["FEATURES"])
    # Filter the ordered list rather than intersecting two sets so that the
    # column order is deterministic and identical for empty and non-empty output.
    features_to_compute = [name for name in supported_features if name in requested]

    empty_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)

    # Nothing to compute: return the correctly-shaped empty frame instead of
    # a frame carrying only a spurious "index" column.
    if not features_to_compute or speech_data.empty:
        return empty_features

    speech_data = filter_data_by_segment(speech_data, time_segment)
    if speech_data.empty:
        return empty_features

    # Build the grouping once and reuse it for every aggregation.
    grouped = speech_data.groupby(["local_segment"])["speech_proportion"]
    aggregations = {
        "meanspeech": lambda groups: groups.mean(),
        "stdspeech": lambda groups: groups.std(),
        # Mean of the (up to) five largest / smallest values in each segment.
        "nlargest": lambda groups: groups.apply(lambda values: values.nlargest(5).mean()),
        "nsmallest": lambda groups: groups.apply(lambda values: values.nsmallest(5).mean()),
        "medianspeech": lambda groups: groups.median(),
    }

    speech_features = pd.DataFrame()
    for name in features_to_compute:
        # The first assignment adopts the grouped index (local_segment).
        speech_features[name] = aggregations[name](grouped)

    # Turn the local_segment index into a regular column.
    return speech_features.reset_index()