import pandas as pd
import numpy as np
import math as m
import sys
def extract_second_order_features(intraday_features, so_features_names, prefix=""):
    """Aggregate per-window (intraday) features into second-order features per segment.

    Parameters
    ----------
    intraday_features : pd.DataFrame
        First-order features, one row per time window. Must contain the
        grouping columns and a ``prefix + "level_1"`` window-index column.
    so_features_names : iterable of str
        Which aggregations to compute. Recognized values: "mean", "median",
        "sd", "nlargest", "nsmallest", "count_windows",
        "eda_num_peaks_non_zero", "hrv_num_windows_non_nan".
    prefix : str, optional
        Column-name prefix (e.g. sensor name). When non-empty, segments are
        grouped by the full set of segment label/datetime columns.

    Returns
    -------
    pd.DataFrame
        One row per segment with the requested ``*_SO_*`` columns, or an
        empty frame with only the grouping columns when the input is empty.
    """
    if prefix:
        groupby_cols = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']
    else:
        groupby_cols = ['local_segment']

    if intraday_features.empty:
        return pd.DataFrame(columns=groupby_cols)

    so_features = pd.DataFrame()

    # Hoist the shared GroupBy objects instead of rebuilding them per feature:
    # `grouped` excludes the window-index column (value aggregations only),
    # `grouped_full` keeps it (window counting, specialized columns).
    level_1_col = prefix + "level_1"
    grouped = intraday_features.drop(level_1_col, axis=1).groupby(groupby_cols)
    grouped_full = intraday_features.groupby(groupby_cols)
    feature_cols = intraday_features.columns[~intraday_features.columns.isin(groupby_cols + [level_1_col])]

    if "mean" in so_features_names:
        so_features = pd.concat([so_features, grouped.mean().add_suffix("_SO_mean")], axis=1)

    if "median" in so_features_names:
        so_features = pd.concat([so_features, grouped.median().add_suffix("_SO_median")], axis=1)

    if "sd" in so_features_names:
        # A segment with a single window yields NaN std; report 0 instead.
        so_features = pd.concat([so_features, grouped.std().fillna(0).add_suffix("_SO_sd")], axis=1)

    if "nlargest" in so_features_names:  # mean of the largest 5 windows -- maybe there is a faster groupby solution?
        for column in feature_cols:
            so_features[column + "_SO_nlargest"] = grouped[column].apply(lambda x: x.nlargest(5).mean())

    if "nsmallest" in so_features_names:  # mean of the smallest 5 windows -- maybe there is a faster groupby solution?
        for column in feature_cols:
            so_features[column + "_SO_nsmallest"] = grouped[column].apply(lambda x: x.nsmallest(5).mean())

    if "count_windows" in so_features_names:
        so_features["SO_windowsCount"] = grouped_full.count()[level_1_col]

    # numPeaksNonZero specialized for EDA sensor
    if "eda_num_peaks_non_zero" in so_features_names and prefix + "numPeaks" in intraday_features.columns:
        so_features[prefix + "SO_numPeaksNonZero"] = grouped_full[prefix + "numPeaks"].apply(lambda x: (x != 0).sum())

    # numWindowsNonZero specialized for BVP and IBI sensors
    if "hrv_num_windows_non_nan" in so_features_names and prefix + "meanHr" in intraday_features.columns:
        so_features[prefix + "SO_numWindowsNonNaN"] = grouped_full[prefix + "meanHr"].apply(lambda x: (~np.isnan(x)).sum())

    so_features.reset_index(inplace=True)
    return so_features
def get_sample_rate(data):  # To-Do get the sample rate information from the file's metadata
    """Estimate the sample rate in Hz from the mean timestamp difference.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a ``timestamp`` column; timestamps are assumed to be in
        milliseconds (hence the 1000 numerator below) -- TODO confirm units.

    Returns
    -------
    int
        Ceiling of 1000 / mean inter-sample gap, i.e. samples per second.

    Raises
    ------
    Exception
        If the mean timestamp difference cannot be computed or is not a
        positive number.
    """
    try:
        timestamps_diff = data['timestamp'].diff().dropna().mean()
    except (KeyError, TypeError, AttributeError) as err:
        # Narrowed from a bare `except:`; chain the cause for easier debugging.
        raise Exception("Error occurred while trying to get the mean sample rate from the data.") from err

    # Guard against an empty or constant timestamp column: the mean gap is
    # then NaN or 0 and would otherwise surface as a confusing
    # ZeroDivisionError / ValueError from math.ceil. (NaN > 0 is False.)
    if not timestamps_diff > 0:
        raise Exception("Error occurred while trying to get the mean sample rate from the data.")

    return m.ceil(1000 / timestamps_diff)