Refactor fitbit heartrate features

pull/95/head
Meng Li 2020-06-12 16:04:03 -04:00
parent 3287866f4b
commit dcc7ca14e3
4 changed files with 89 additions and 74 deletions

View File

@ -115,8 +115,9 @@ APPLICATIONS_FOREGROUND:
HEARTRATE:
DAY_SEGMENTS: *day_segments
FEATURES: ["maxhr", "minhr", "avghr", "medianhr", "modehr", "stdhr", "diffmaxmodehr", "diffminmodehr", "entropyhr", "lengthoutofrange", "lengthfatburn", "lengthcardio", "lengthpeak"]
DAILY_FEATURES_FROM_SUMMARY_DATA: ["restinghr"] # calories related features might be inaccurate: ["caloriesoutofrange", "caloriesfatburn", "caloriescardio", "caloriespeak"]
# Only daily features are extracted from summary data
SUMMARY_FEATURES: ["restinghr"] # calories related features might be inaccurate: ["caloriesoutofrange", "caloriesfatburn", "caloriescardio", "caloriespeak"]
INTRADAY_FEATURES: ["maxhr", "minhr", "avghr", "medianhr", "modehr", "stdhr", "diffmaxmodehr", "diffminmodehr", "entropyhr", "lengthoutofrange", "lengthfatburn", "lengthcardio", "lengthpeak"]
STEP:
DAY_SEGMENTS: *day_segments

View File

@ -190,8 +190,8 @@ rule fitbit_heartrate_features:
heartrate_intraday_data = "data/raw/{pid}/fitbit_heartrate_intraday_with_datetime.csv"
params:
day_segment = "{day_segment}",
features = config["HEARTRATE"]["FEATURES"],
daily_features_from_summary_data = config["HEARTRATE"]["DAILY_FEATURES_FROM_SUMMARY_DATA"]
summary_features = config["HEARTRATE"]["SUMMARY_FEATURES"],
intraday_features = config["HEARTRATE"]["INTRADAY_FEATURES"]
output:
"data/processed/{pid}/fitbit_heartrate_{day_segment}.csv"
script:

View File

@ -0,0 +1,76 @@
import pandas as pd
from scipy.stats import entropy
def extractHRFeaturesFromSummaryData(heartrate_summary_data, summary_features):
    """Select the requested daily heart rate columns from the summary data.

    Args:
        heartrate_summary_data: DataFrame holding the ``heartrate_daily_*``
            columns; its index becomes a regular column of the result.
        summary_features: names of the summary features to keep (any of
            "restinghr", "caloriesoutofrange", "caloriesfatburn",
            "caloriescardio", "caloriespeak").

    Returns:
        A new DataFrame with one ``heartrate_daily_<feature>`` column per
        requested feature, after ``reset_index``.
    """
    # calories features might be inaccurate: they depend on users' fitbit
    # profile (weight, height, etc.)
    supported_features = (
        "restinghr",
        "caloriesoutofrange",
        "caloriesfatburn",
        "caloriescardio",
        "caloriespeak",
    )
    selected = pd.DataFrame()
    # Walk the supported names (not the request) so the column order is fixed.
    for feature in supported_features:
        if feature in summary_features:
            column = "heartrate_daily_" + feature
            selected[column] = heartrate_summary_data[column]
    selected.reset_index(inplace=True)
    return selected
def _first_mode(values):
    """Return the first mode pandas reports for ``values`` (NaN if there is none)."""
    modes = pd.Series.mode(values)
    return modes.iloc[0] if not modes.empty else float("nan")


def extractHRFeaturesFromIntradayData(heartrate_intraday_data, features, day_segment):
    """Compute per-date heart rate features from intraday data.

    Args:
        heartrate_intraday_data: DataFrame with at least the columns
            "device_id", "local_date", "local_hour", "local_minute",
            "heartrate" and "heartrate_zone" (plus "local_day_segment" when
            ``day_segment`` is not "daily").
        features: names of the features to compute ("maxhr", "minhr", "avghr",
            "medianhr", "modehr", "stdhr", "diffmaxmodehr", "diffminmodehr",
            "entropyhr", "lengthoutofrange", "lengthfatburn", "lengthcardio",
            "lengthpeak").
        day_segment: "daily" to use every row, otherwise the value of
            "local_day_segment" to keep.

    Returns:
        A DataFrame with a "local_date" column and one
        ``heartrate_<day_segment>_<feature>`` column per computed feature.
        When there is no data (at all, or for the segment) the frame has the
        expected columns and no rows.
    """
    column_prefix = "heartrate_" + day_segment + "_"
    no_features = pd.DataFrame(columns=["local_date"] + [column_prefix + x for x in features])
    if heartrate_intraday_data.empty:
        return no_features

    # Average number of rows per (date, hour, minute), taken over the whole data
    # set before segment filtering; used to turn row counts into minutes.
    num_rows_per_minute = heartrate_intraday_data.groupby(["local_date", "local_hour", "local_minute"]).count().mean()["device_id"]

    if day_segment != "daily":
        heartrate_intraday_data = heartrate_intraday_data[heartrate_intraday_data["local_day_segment"] == day_segment]
    if heartrate_intraday_data.empty:
        return no_features

    hr_by_date = heartrate_intraday_data.groupby(["local_date"])["heartrate"]
    # Seed the result with every date that has data. Otherwise, when only zone
    # lengths are requested, the first zone assigned would define the index and
    # dates without rows in that zone would be dropped instead of reported as 0.
    heartrate_intraday_features = pd.DataFrame(index=hr_by_date.size().index)

    # The mode is shared by three features; compute it at most once.
    needs_mode = any(name in features for name in ("modehr", "diffmaxmodehr", "diffminmodehr"))
    hr_mode = hr_by_date.agg(_first_mode) if needs_mode else None

    # get stats of heartrate
    if "maxhr" in features:
        heartrate_intraday_features[column_prefix + "maxhr"] = hr_by_date.max()
    if "minhr" in features:
        heartrate_intraday_features[column_prefix + "minhr"] = hr_by_date.min()
    if "avghr" in features:
        heartrate_intraday_features[column_prefix + "avghr"] = hr_by_date.mean()
    if "medianhr" in features:
        heartrate_intraday_features[column_prefix + "medianhr"] = hr_by_date.median()
    if "modehr" in features:
        heartrate_intraday_features[column_prefix + "modehr"] = hr_mode
    if "stdhr" in features:
        heartrate_intraday_features[column_prefix + "stdhr"] = hr_by_date.std()
    if "diffmaxmodehr" in features:
        heartrate_intraday_features[column_prefix + "diffmaxmodehr"] = hr_by_date.max() - hr_mode
    if "diffminmodehr" in features:
        heartrate_intraday_features[column_prefix + "diffminmodehr"] = hr_mode - hr_by_date.min()
    if "entropyhr" in features:
        heartrate_intraday_features[column_prefix + "entropyhr"] = hr_by_date.agg(entropy)

    # get number of minutes in each heart rate zone; a fixed iteration order
    # keeps the column order reproducible between runs
    for zone_feature in ("lengthoutofrange", "lengthfatburn", "lengthcardio", "lengthpeak"):
        if zone_feature not in features:
            continue
        # the zone label is the feature name without its "length" prefix
        zone_rows = heartrate_intraday_data[heartrate_intraday_data["heartrate_zone"] == zone_feature[len("length"):]]
        zone_column = column_prefix + zone_feature
        heartrate_intraday_features[zone_column] = zone_rows.groupby(["local_date"])["device_id"].count() / num_rows_per_minute
        # dates with data but no rows in this zone spent 0 minutes in it
        heartrate_intraday_features[zone_column] = heartrate_intraday_features[zone_column].fillna(0)

    return heartrate_intraday_features.reset_index()
def base_fitbit_heartrate_features(heartrate_summary_data, heartrate_intraday_data, day_segment, requested_summary_features, requested_intraday_features):
    """Compute the heart rate features this module supports for one day segment.

    Args:
        heartrate_summary_data: DataFrame of daily summary data.
        heartrate_intraday_data: DataFrame of intraday data.
        day_segment: segment to compute; summary features are only added when
            it is "daily".
        requested_summary_features: summary feature names requested by the caller.
        requested_intraday_features: intraday feature names requested by the caller.

    Returns:
        The intraday features, outer-merged on "local_date" with the summary
        features when the segment is "daily" and summary features were
        requested. If the summary data is empty in that case, the requested
        summary columns are still present, filled with NaN.
    """
    # name of the features this function can compute
    base_summary_features_names = ["restinghr", "caloriesoutofrange", "caloriesfatburn", "caloriescardio", "caloriespeak"]
    base_intraday_features_names = ["maxhr", "minhr", "avghr", "medianhr", "modehr", "stdhr", "diffmaxmodehr", "diffminmodehr", "entropyhr", "lengthoutofrange", "lengthfatburn", "lengthcardio", "lengthpeak"]
    # the subset of requested features this function can compute; keep the
    # requested order (duplicates dropped) instead of going through a set,
    # whose iteration order for strings can change between interpreter runs
    summary_features_to_compute = [x for x in dict.fromkeys(requested_summary_features) if x in base_summary_features_names]
    intraday_features_to_compute = [x for x in dict.fromkeys(requested_intraday_features) if x in base_intraday_features_names]

    heartrate_features = extractHRFeaturesFromIntradayData(heartrate_intraday_data, intraday_features_to_compute, day_segment)
    if day_segment == "daily" and summary_features_to_compute:
        if not heartrate_summary_data.empty:
            heartrate_summary_features = extractHRFeaturesFromSummaryData(heartrate_summary_data, summary_features_to_compute)
            heartrate_features = heartrate_features.merge(heartrate_summary_features, on=["local_date"], how="outer")
        else:
            # No summary rows: still emit the requested columns (as NaN) so the
            # output has one column per requested feature, which is what the
            # calling script asserts before writing the CSV.
            for feature in summary_features_to_compute:
                heartrate_features["heartrate_daily_" + feature] = float("nan")
    return heartrate_features

View File

@ -1,78 +1,16 @@
import pandas as pd
import numpy as np
from scipy.stats import entropy
import json
def extractHRFeaturesFromSummaryData(heartrate_summary_data, daily_features_from_summary_data):
    """Copy the requested daily summary columns into a fresh DataFrame.

    Args:
        heartrate_summary_data: DataFrame containing the ``heartrate_daily_*``
            columns; its index is turned into a column of the result.
        daily_features_from_summary_data: requested feature names.

    Returns:
        DataFrame with the selected columns, after ``reset_index``.
    """
    # feature name -> source/destination column, listed in output order
    # calories features might be inaccurate: they depend on users' fitbit profile (weight, height, etc.)
    column_by_feature = {
        "restinghr": "heartrate_daily_restinghr",
        "caloriesoutofrange": "heartrate_daily_caloriesoutofrange",
        "caloriesfatburn": "heartrate_daily_caloriesfatburn",
        "caloriescardio": "heartrate_daily_caloriescardio",
        "caloriespeak": "heartrate_daily_caloriespeak",
    }
    summary_columns = pd.DataFrame()
    for feature_name, column_name in column_by_feature.items():
        if feature_name not in daily_features_from_summary_data:
            continue
        summary_columns[column_name] = heartrate_summary_data[column_name]
    summary_columns.reset_index(inplace=True)
    return summary_columns
def extractHRFeaturesFromIntradayData(heartrate_intraday_data, features):
    """Compute per-date heart rate features from intraday data.

    ``day_segment`` is not a parameter of this function: it is read from
    module scope, so it must be bound before the function is called.

    Args:
        heartrate_intraday_data: DataFrame with at least the columns
            "device_id", "local_date", "local_hour", "local_minute",
            "heartrate" and "heartrate_zone" (plus "local_day_segment" when
            ``day_segment`` is not "daily").
        features: names of the features to compute.

    Returns:
        A DataFrame with a "local_date" column and one
        ``heartrate_<day_segment>_<feature>`` column per computed feature; it
        has the expected columns and no rows when there is no data.
    """
    prefix = "heartrate_" + day_segment + "_"
    empty_result = pd.DataFrame(columns=["local_date"] + [prefix + x for x in features])
    if heartrate_intraday_data.empty:
        return empty_result

    # Average rows per (date, hour, minute) over the unfiltered data; converts
    # row counts into minutes further down.
    num_rows_per_minute = heartrate_intraday_data.groupby(["local_date", "local_hour", "local_minute"]).count().mean()["device_id"]

    segment_data = heartrate_intraday_data
    if day_segment != "daily":
        segment_data = segment_data[segment_data["local_day_segment"] == day_segment]
    if segment_data.empty:
        return empty_result

    hr_by_date = segment_data.groupby(["local_date"])["heartrate"]

    def mode_per_date():
        # first mode reported by pandas for each date
        return hr_by_date.agg(lambda x: pd.Series.mode(x)[0])

    # get stats of heartrate: (feature name, builder), in output column order
    stat_builders = [
        ("maxhr", hr_by_date.max),
        ("minhr", hr_by_date.min),
        ("avghr", hr_by_date.mean),
        ("medianhr", hr_by_date.median),
        ("modehr", mode_per_date),
        ("stdhr", hr_by_date.std),
        ("diffmaxmodehr", lambda: hr_by_date.max() - mode_per_date()),
        ("diffminmodehr", lambda: mode_per_date() - hr_by_date.min()),
        ("entropyhr", lambda: hr_by_date.agg(entropy)),
    ]

    # Start from every date that has data so that requesting only zone lengths
    # does not let the first zone assigned decide which dates appear.
    heartrate_intraday_features = pd.DataFrame(index=hr_by_date.size().index)
    for stat_name, build in stat_builders:
        if stat_name in features:
            heartrate_intraday_features[prefix + stat_name] = build()

    # get number of minutes in each heart rate zone (fixed order => stable columns)
    for zone_feature in ("lengthoutofrange", "lengthfatburn", "lengthcardio", "lengthpeak"):
        if zone_feature not in features:
            continue
        # zone label = feature name without the leading "length"
        in_zone = segment_data[segment_data["heartrate_zone"] == zone_feature[6:]]
        zone_column = prefix + zone_feature
        heartrate_intraday_features[zone_column] = in_zone.groupby(["local_date"])["device_id"].count() / num_rows_per_minute
        heartrate_intraday_features[zone_column] = heartrate_intraday_features[zone_column].fillna(0)

    heartrate_intraday_features.reset_index(inplace=True)
    return heartrate_intraday_features
from fitbit_heartrate.fitbit_heartrate_base import base_fitbit_heartrate_features
# Script body run by Snakemake: reads the summary and intraday heart rate CSVs
# named in snakemake.input, builds the feature table and writes it to
# snakemake.output[0].
# NOTE(review): this span looks like a flattened diff hunk -- statements from
# before and after the refactor appear interleaved and the if/else bodies have
# lost their indentation. Confirm which lines are live before running it.
heartrate_summary_data = pd.read_csv(snakemake.input["heartrate_summary_data"], index_col=["local_date"], parse_dates=["local_date"])
heartrate_intraday_data = pd.read_csv(snakemake.input["heartrate_intraday_data"], parse_dates=["local_date_time", "local_date"])
day_segment = snakemake.params["day_segment"]
# presumably the pre-refactor parameter names -- TODO confirm against the rule's params
features = snakemake.params["features"]
daily_features_from_summary_data = snakemake.params["daily_features_from_summary_data"]
# parameter names consumed by base_fitbit_heartrate_features below
requested_summary_features = snakemake.params["summary_features"]
requested_intraday_features = snakemake.params["intraday_features"]
# empty frame keyed by local_date; the base features are outer-merged into it further down
heartrate_features = pd.DataFrame(columns=["local_date"])
# NOTE(review): extraction through the in-file helpers; appears to be superseded
# by the base_fitbit_heartrate_features call below -- verify
heartrate_intraday_features = extractHRFeaturesFromIntradayData(heartrate_intraday_data, features)
if not heartrate_summary_data.empty and day_segment == "daily" and daily_features_from_summary_data != []:
heartrate_summary_features = extractHRFeaturesFromSummaryData(heartrate_summary_data, daily_features_from_summary_data)
heartrate_features = heartrate_intraday_features.merge(heartrate_summary_features, on=["local_date"], how="outer")
else:
heartrate_features = heartrate_intraday_features
heartrate_features = heartrate_features.merge(base_fitbit_heartrate_features(heartrate_summary_data, heartrate_intraday_data, day_segment, requested_summary_features, requested_intraday_features), on="local_date", how="outer")
# summary features only count towards the expected columns for the daily segment
requested_features = requested_summary_features + requested_intraday_features if day_segment == "daily" else requested_intraday_features
# sanity check: one column per requested feature plus local_date
assert len(requested_features) + 1 == heartrate_features.shape[1], "The number of features in the output dataframe (=" + str(heartrate_features.shape[1]) + ") does not match the expected value (=" + str(len(requested_features)) + " + 1). Verify your fitbit heartrate feature extraction functions"
heartrate_features.to_csv(snakemake.output[0], index=False)