diff --git a/config.yaml b/config.yaml
index 456c1cb1..a36d9360 100644
--- a/config.yaml
+++ b/config.yaml
@@ -115,8 +115,9 @@ APPLICATIONS_FOREGROUND:
 
 HEARTRATE:
   DAY_SEGMENTS: *day_segments
-  FEATURES: ["maxhr", "minhr", "avghr", "medianhr", "modehr", "stdhr", "diffmaxmodehr", "diffminmodehr", "entropyhr", "lengthoutofrange", "lengthfatburn", "lengthcardio", "lengthpeak"]
-  DAILY_FEATURES_FROM_SUMMARY_DATA: ["restinghr"] # calories related features might be inaccurate: ["caloriesoutofrange", "caloriesfatburn", "caloriescardio", "caloriespeak"]
+  # Only daily features are extracted from summary data
+  SUMMARY_FEATURES: ["restinghr"] # calories related features might be inaccurate: ["caloriesoutofrange", "caloriesfatburn", "caloriescardio", "caloriespeak"]
+  INTRADAY_FEATURES: ["maxhr", "minhr", "avghr", "medianhr", "modehr", "stdhr", "diffmaxmodehr", "diffminmodehr", "entropyhr", "lengthoutofrange", "lengthfatburn", "lengthcardio", "lengthpeak"]
 
 STEP:
   DAY_SEGMENTS: *day_segments
diff --git a/rules/features.snakefile b/rules/features.snakefile
index 5b38e15a..7f1a1088 100644
--- a/rules/features.snakefile
+++ b/rules/features.snakefile
@@ -190,8 +190,8 @@ rule fitbit_heartrate_features:
         heartrate_intraday_data = "data/raw/{pid}/fitbit_heartrate_intraday_with_datetime.csv"
     params:
         day_segment = "{day_segment}",
-        features = config["HEARTRATE"]["FEATURES"],
-        daily_features_from_summary_data = config["HEARTRATE"]["DAILY_FEATURES_FROM_SUMMARY_DATA"]
+        summary_features = config["HEARTRATE"]["SUMMARY_FEATURES"],
+        intraday_features = config["HEARTRATE"]["INTRADAY_FEATURES"]
     output:
         "data/processed/{pid}/fitbit_heartrate_{day_segment}.csv"
     script:
diff --git a/src/features/fitbit_heartrate/fitbit_heartrate_base.py b/src/features/fitbit_heartrate/fitbit_heartrate_base.py
new file mode 100644
index 00000000..c2dbff99
--- /dev/null
+++ b/src/features/fitbit_heartrate/fitbit_heartrate_base.py
@@ -0,0 +1,103 @@
+import pandas as pd
+from scipy.stats import entropy
+
+def extractHRFeaturesFromSummaryData(heartrate_summary_data, summary_features):
+    """Copy the requested daily summary features into a new dataframe.
+
+    heartrate_summary_data is expected to be indexed by local_date and to contain a
+    "heartrate_daily_<feature>" column for every requested feature. The result exposes
+    local_date as a regular column so it can be merged with the intraday features.
+    """
+    # Seed the index explicitly so every date is kept regardless of which columns are copied
+    heartrate_summary_features = pd.DataFrame(index=heartrate_summary_data.index)
+    # calories features might be inaccurate: they depend on users' fitbit profile (weight, height, etc.)
+    for feature_name in ["restinghr", "caloriesoutofrange", "caloriesfatburn", "caloriescardio", "caloriespeak"]:
+        if feature_name in summary_features:
+            column_name = "heartrate_daily_" + feature_name
+            heartrate_summary_features[column_name] = heartrate_summary_data[column_name]
+
+    return heartrate_summary_features.reset_index()
+
+def extractHRFeaturesFromIntradayData(heartrate_intraday_data, features, day_segment):
+    """Compute per-day heart rate features for one day segment from intraday data.
+
+    Returns one row per local_date with a "heartrate_<day_segment>_<feature>" column for
+    each requested feature. When there is no data for the segment, an empty dataframe
+    with the expected columns is returned.
+    """
+    column_prefix = "heartrate_" + day_segment + "_"
+    heartrate_intraday_features = pd.DataFrame(columns=["local_date"] + [column_prefix + x for x in features])
+    if heartrate_intraday_data.empty:
+        return heartrate_intraday_features
+
+    # The number of rows per minute is estimated from the whole dataset, before filtering by day segment
+    num_rows_per_minute = heartrate_intraday_data.groupby(["local_date", "local_hour", "local_minute"]).count().mean()["device_id"]
+    if day_segment != "daily":
+        heartrate_intraday_data = heartrate_intraday_data[heartrate_intraday_data["local_day_segment"] == day_segment]
+    if heartrate_intraday_data.empty:
+        return heartrate_intraday_features
+
+    heartrate_by_date = heartrate_intraday_data.groupby(["local_date"])["heartrate"]
+    # Seed the index with every date so zone lengths are 0 (not missing) on days without
+    # rows in a zone, even when no other feature was computed before them
+    heartrate_intraday_features = pd.DataFrame(index=heartrate_by_date.size().index)
+
+    # the mode feeds three features, so it is computed at most once
+    if {"modehr", "diffmaxmodehr", "diffminmodehr"} & set(features):
+        modehr = heartrate_by_date.agg(lambda x: pd.Series.mode(x)[0])
+
+    # get stats of heartrate
+    if "maxhr" in features:
+        heartrate_intraday_features[column_prefix + "maxhr"] = heartrate_by_date.max()
+    if "minhr" in features:
+        heartrate_intraday_features[column_prefix + "minhr"] = heartrate_by_date.min()
+    if "avghr" in features:
+        heartrate_intraday_features[column_prefix + "avghr"] = heartrate_by_date.mean()
+    if "medianhr" in features:
+        heartrate_intraday_features[column_prefix + "medianhr"] = heartrate_by_date.median()
+    if "modehr" in features:
+        heartrate_intraday_features[column_prefix + "modehr"] = modehr
+    if "stdhr" in features:
+        heartrate_intraday_features[column_prefix + "stdhr"] = heartrate_by_date.std()
+    if "diffmaxmodehr" in features:
+        heartrate_intraday_features[column_prefix + "diffmaxmodehr"] = heartrate_by_date.max() - modehr
+    if "diffminmodehr" in features:
+        heartrate_intraday_features[column_prefix + "diffminmodehr"] = modehr - heartrate_by_date.min()
+    if "entropyhr" in features:
+        heartrate_intraday_features[column_prefix + "entropyhr"] = heartrate_by_date.agg(entropy)
+
+    # get number of minutes in each heart rate zone
+    for feature_name in ["lengthoutofrange", "lengthfatburn", "lengthcardio", "lengthpeak"]:
+        if feature_name in features:
+            heartrate_zone = heartrate_intraday_data[heartrate_intraday_data["heartrate_zone"] == feature_name[6:]]
+            minutes_in_zone = heartrate_zone.groupby(["local_date"])["device_id"].count() / num_rows_per_minute
+            heartrate_intraday_features[column_prefix + feature_name] = minutes_in_zone.reindex(heartrate_intraday_features.index, fill_value=0)
+
+    return heartrate_intraday_features.reset_index()
+
+def base_fitbit_heartrate_features(heartrate_summary_data, heartrate_intraday_data, day_segment, requested_summary_features, requested_intraday_features):
+    """Compute the fitbit heart rate features this module supports for one day segment.
+
+    Intraday features are computed for any segment; summary features only for "daily".
+    Requested features that are not listed below are ignored.
+    """
+    # name of the features this function can compute
+    base_summary_features_names = ["restinghr", "caloriesoutofrange", "caloriesfatburn", "caloriescardio", "caloriespeak"]
+    base_intraday_features_names = ["maxhr", "minhr", "avghr", "medianhr", "modehr", "stdhr", "diffmaxmodehr", "diffminmodehr", "entropyhr", "lengthoutofrange", "lengthfatburn", "lengthcardio", "lengthpeak"]
+    # the subset of requested features this function can compute, kept in the order above
+    # (a set intersection would not guarantee a stable column order in the output)
+    summary_features_to_compute = [x for x in base_summary_features_names if x in requested_summary_features]
+    intraday_features_to_compute = [x for x in base_intraday_features_names if x in requested_intraday_features]
+
+    heartrate_features = extractHRFeaturesFromIntradayData(heartrate_intraday_data, intraday_features_to_compute, day_segment)
+    if day_segment == "daily" and summary_features_to_compute:
+        if not heartrate_summary_data.empty:
+            heartrate_summary_features = extractHRFeaturesFromSummaryData(heartrate_summary_data, summary_features_to_compute)
+            heartrate_features = heartrate_features.merge(heartrate_summary_features, on=["local_date"], how="outer")
+        else:
+            # No summary rows: still expose the requested columns (as NaN) so the output
+            # schema does not depend on whether summary data happened to be available
+            summary_columns = ["heartrate_daily_" + x for x in summary_features_to_compute]
+            heartrate_features = heartrate_features.reindex(columns=list(heartrate_features.columns) + summary_columns)
+
+    return heartrate_features
diff --git a/src/features/fitbit_heartrate_features.py b/src/features/fitbit_heartrate_features.py
index 199f4ea1..61bad1ee 100644
--- a/src/features/fitbit_heartrate_features.py
+++ b/src/features/fitbit_heartrate_features.py
@@ -1,78 +1,16 @@
 import pandas as pd
-import numpy as np
-from scipy.stats import entropy
-import json
-
-
-def extractHRFeaturesFromSummaryData(heartrate_summary_data, daily_features_from_summary_data):
-    heartrate_summary_features = pd.DataFrame()
-    if "restinghr" in daily_features_from_summary_data:
-        heartrate_summary_features["heartrate_daily_restinghr"] = heartrate_summary_data["heartrate_daily_restinghr"]
-    # calories features might be inaccurate: they depend on users' fitbit profile (weight, height, etc.)
-    if "caloriesoutofrange" in daily_features_from_summary_data:
-        heartrate_summary_features["heartrate_daily_caloriesoutofrange"] = heartrate_summary_data["heartrate_daily_caloriesoutofrange"]
-    if "caloriesfatburn" in daily_features_from_summary_data:
-        heartrate_summary_features["heartrate_daily_caloriesfatburn"] = heartrate_summary_data["heartrate_daily_caloriesfatburn"]
-    if "caloriescardio" in daily_features_from_summary_data:
-        heartrate_summary_features["heartrate_daily_caloriescardio"] = heartrate_summary_data["heartrate_daily_caloriescardio"]
-    if "caloriespeak" in daily_features_from_summary_data:
-        heartrate_summary_features["heartrate_daily_caloriespeak"] = heartrate_summary_data["heartrate_daily_caloriespeak"]
-    heartrate_summary_features.reset_index(inplace=True)
-
-    return heartrate_summary_features
-
-def extractHRFeaturesFromIntradayData(heartrate_intraday_data, features):
-    heartrate_intraday_features = pd.DataFrame(columns=["local_date"] + ["heartrate_" + day_segment + "_" + x for x in features])
-    if not heartrate_intraday_data.empty:
-        device_id = heartrate_intraday_data["device_id"][0]
-        num_rows_per_minute = heartrate_intraday_data.groupby(["local_date", "local_hour", "local_minute"]).count().mean()["device_id"]
-        if day_segment != "daily":
-            heartrate_intraday_data = heartrate_intraday_data[heartrate_intraday_data["local_day_segment"] == day_segment]
-
-        if not heartrate_intraday_data.empty:
-            heartrate_intraday_features = pd.DataFrame()
-
-            # get stats of heartrate
-            if "maxhr" in features:
-                heartrate_intraday_features["heartrate_" + day_segment + "_maxhr"] = heartrate_intraday_data[["local_date", "heartrate"]].groupby(["local_date"])["heartrate"].max()
-            if "minhr" in features:
-                heartrate_intraday_features["heartrate_" + day_segment + "_minhr"] = heartrate_intraday_data[["local_date", "heartrate"]].groupby(["local_date"])["heartrate"].min()
-            if "avghr" in features:
-                heartrate_intraday_features["heartrate_" + day_segment + "_avghr"] = heartrate_intraday_data[["local_date", "heartrate"]].groupby(["local_date"])["heartrate"].mean()
-            if "medianhr" in features:
-                heartrate_intraday_features["heartrate_" + day_segment + "_medianhr"] = heartrate_intraday_data[["local_date", "heartrate"]].groupby(["local_date"])["heartrate"].median()
-            if "modehr" in features:
-                heartrate_intraday_features["heartrate_" + day_segment + "_modehr"] = heartrate_intraday_data[["local_date", "heartrate"]].groupby(["local_date"])["heartrate"].agg(lambda x: pd.Series.mode(x)[0])
-            if "stdhr" in features:
-                heartrate_intraday_features["heartrate_" + day_segment + "_stdhr"] = heartrate_intraday_data[["local_date", "heartrate"]].groupby(["local_date"])["heartrate"].std()
-            if "diffmaxmodehr" in features:
-                heartrate_intraday_features["heartrate_" + day_segment + "_diffmaxmodehr"] = heartrate_intraday_data[["local_date", "heartrate"]].groupby(["local_date"])["heartrate"].max() - heartrate_intraday_data[["local_date", "heartrate"]].groupby(["local_date"])["heartrate"].agg(lambda x: pd.Series.mode(x)[0])
-            if "diffminmodehr" in features:
-                heartrate_intraday_features["heartrate_" + day_segment + "_diffminmodehr"] = heartrate_intraday_data[["local_date", "heartrate"]].groupby(["local_date"])["heartrate"].agg(lambda x: pd.Series.mode(x)[0]) - heartrate_intraday_data[["local_date", "heartrate"]].groupby(["local_date"])["heartrate"].min()
-            if "entropyhr" in features:
-                heartrate_intraday_features["heartrate_" + day_segment + "_entropyhr"] = heartrate_intraday_data[["local_date", "heartrate"]].groupby(["local_date"])["heartrate"].agg(entropy)
-
-            # get number of minutes in each heart rate zone
-            for feature_name in list(set(["lengthoutofrange", "lengthfatburn", "lengthcardio", "lengthpeak"]) & set(features)):
-                heartrate_zone = heartrate_intraday_data[heartrate_intraday_data["heartrate_zone"] == feature_name[6:]]
-                heartrate_intraday_features["heartrate_" + day_segment + "_" + feature_name] = heartrate_zone.groupby(["local_date"])["device_id"].count() / num_rows_per_minute
-                heartrate_intraday_features.fillna(value={"heartrate_" + day_segment + "_" + feature_name: 0}, inplace=True)
-            heartrate_intraday_features.reset_index(inplace=True)
-
-    return heartrate_intraday_features
-
+from fitbit_heartrate.fitbit_heartrate_base import base_fitbit_heartrate_features
 
 heartrate_summary_data = pd.read_csv(snakemake.input["heartrate_summary_data"], index_col=["local_date"], parse_dates=["local_date"])
 heartrate_intraday_data = pd.read_csv(snakemake.input["heartrate_intraday_data"], parse_dates=["local_date_time", "local_date"])
 day_segment = snakemake.params["day_segment"]
-features = snakemake.params["features"]
-daily_features_from_summary_data = snakemake.params["daily_features_from_summary_data"]
+requested_summary_features = snakemake.params["summary_features"]
+requested_intraday_features = snakemake.params["intraday_features"]
+heartrate_features = pd.DataFrame(columns=["local_date"])
 
-heartrate_intraday_features = extractHRFeaturesFromIntradayData(heartrate_intraday_data, features)
-if not heartrate_summary_data.empty and day_segment == "daily" and daily_features_from_summary_data != []:
-    heartrate_summary_features = extractHRFeaturesFromSummaryData(heartrate_summary_data, daily_features_from_summary_data)
-    heartrate_features = heartrate_intraday_features.merge(heartrate_summary_features, on=["local_date"], how="outer")
-else:
-    heartrate_features = heartrate_intraday_features
+heartrate_features = heartrate_features.merge(base_fitbit_heartrate_features(heartrate_summary_data, heartrate_intraday_data, day_segment, requested_summary_features, requested_intraday_features), on="local_date", how="outer")
+
+requested_features = requested_summary_features + requested_intraday_features if day_segment == "daily" else requested_intraday_features
+assert len(requested_features) + 1 == heartrate_features.shape[1], "The number of features in the output dataframe (=" + str(heartrate_features.shape[1]) + ") does not match the expected value (=" + str(len(requested_features)) + " + 1). Verify your fitbit heartrate feature extraction functions"
 
 heartrate_features.to_csv(snakemake.output[0], index=False)