diff --git a/Snakefile b/Snakefile index 3b57d270..5cfe9e44 100644 --- a/Snakefile +++ b/Snakefile @@ -235,8 +235,7 @@ for provider in config["FITBIT_HEARTRATE_SUMMARY"]["PROVIDERS"].keys(): for provider in config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"].keys(): if config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"][provider]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_raw.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_parsed.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_parsed_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_with_datetime.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/fitbit_heartrate_intraday_features/fitbit_heartrate_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_heartrate_intraday.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) diff --git a/docs/datastreams/fitbitjson-mysql.md b/docs/datastreams/fitbitjson-mysql.md index 4d654497..a4b4e1c2 100644 --- a/docs/datastreams/fitbitjson-mysql.md +++ b/docs/datastreams/fitbitjson-mysql.md @@ -60,6 +60,45 @@ If you want RAPIDS to process Fitbit sensor data using this stream, you will nee |a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"activities-heart":[{"dateTime":"2020-10-08","value":{"customHeartRateZones":[],"heartRateZones":[{"caloriesOut":1100.1120,"max":89,"min":30,"minutes":921,"name":"Out of Range"},{"caloriesOut":660.0012,"max":118,"min":82,"minutes":361,"name":"Fat 
Burn"},{"caloriesOut":23.7088,"max":142,"min":108,"minutes":3,"name":"Cardio"},{"caloriesOut":0,"max":221,"min":148,"minutes":0,"name":"Peak"}],"restingHeartRate":70}}],"activities-heart-intraday":{"dataset":[{"time":"00:00:00","value":77},{"time":"00:01:00","value":75},{"time":"00:02:00","value":73},...],"datasetInterval":1,"datasetType":"minute"}} |a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"activities-heart":[{"dateTime":"2020-10-09","value":{"customHeartRateZones":[],"heartRateZones":[{"caloriesOut":750.3615,"max":77,"min":30,"minutes":851,"name":"Out of Range"},{"caloriesOut":734.1516,"max":107,"min":77,"minutes":550,"name":"Fat Burn"},{"caloriesOut":131.8579,"max":130,"min":107,"minutes":29,"name":"Cardio"},{"caloriesOut":0,"max":220,"min":130,"minutes":0,"name":"Peak"}],"restingHeartRate":69}}],"activities-heart-intraday":{"dataset":[{"time":"00:00:00","value":90},{"time":"00:01:00","value":89},{"time":"00:02:00","value":88},...],"datasetInterval":1,"datasetType":"minute"}} +??? info "FITBIT_HEARTRATE_INTRADAY" + + **RAPIDS_COLUMN_MAPPINGS** + + | RAPIDS column | Stream column | + |-----------------|-----------------| + | LOCAL_DATE_TIME | FLAG_TO_MUTATE | + | DEVICE_ID | device_id | + | HEARTRATE | FLAG_TO_MUTATE | + | HEARTRATE_ZONE | FLAG_TO_MUTATE | + + + **MUTATION** + + - **COLUMN_MAPPINGS** + + | Script column | Stream column | + |-----------------|-----------------| + | JSON_FITBIT_COLUMN | fitbit_data | + + - **SCRIPTS** + + ```bash + src/data/streams/mutations/fitbit/parse_heartrate_intraday_json.py + ``` + + !!! note + All columns except `DEVICE_ID` are parsed from `JSON_FITBIT_COLUMN`. `JSON_FITBIT_COLUMN` is a string column containing the JSON objects returned by Fitbit's API. See an example of the raw data RAPIDS expects for this data stream: + + + ??? 
"Example of the raw data RAPIDS expects for this data stream" + + |device_id |fitbit_data | + |---------------------------------------- |--------------------------------------------------------- | + |a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"activities-heart":[{"dateTime":"2020-10-07","value":{"customHeartRateZones":[],"heartRateZones":[{"caloriesOut":1200.6102,"max":88,"min":31,"minutes":1058,"name":"Out of Range"},{"caloriesOut":760.3020,"max":120,"min":86,"minutes":366,"name":"Fat Burn"},{"caloriesOut":15.2048,"max":146,"min":120,"minutes":2,"name":"Cardio"},{"caloriesOut":0,"max":221,"min":148,"minutes":0,"name":"Peak"}],"restingHeartRate":72}}],"activities-heart-intraday":{"dataset":[{"time":"00:00:00","value":68},{"time":"00:01:00","value":67},{"time":"00:02:00","value":67},...],"datasetInterval":1,"datasetType":"minute"}} + |a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"activities-heart":[{"dateTime":"2020-10-08","value":{"customHeartRateZones":[],"heartRateZones":[{"caloriesOut":1100.1120,"max":89,"min":30,"minutes":921,"name":"Out of Range"},{"caloriesOut":660.0012,"max":118,"min":82,"minutes":361,"name":"Fat Burn"},{"caloriesOut":23.7088,"max":142,"min":108,"minutes":3,"name":"Cardio"},{"caloriesOut":0,"max":221,"min":148,"minutes":0,"name":"Peak"}],"restingHeartRate":70}}],"activities-heart-intraday":{"dataset":[{"time":"00:00:00","value":77},{"time":"00:01:00","value":75},{"time":"00:02:00","value":73},...],"datasetInterval":1,"datasetType":"minute"}} + |a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"activities-heart":[{"dateTime":"2020-10-09","value":{"customHeartRateZones":[],"heartRateZones":[{"caloriesOut":750.3615,"max":77,"min":30,"minutes":851,"name":"Out of Range"},{"caloriesOut":734.1516,"max":107,"min":77,"minutes":550,"name":"Fat 
Burn"},{"caloriesOut":131.8579,"max":130,"min":107,"minutes":29,"name":"Cardio"},{"caloriesOut":0,"max":220,"min":130,"minutes":0,"name":"Peak"}],"restingHeartRate":69}}],"activities-heart-intraday":{"dataset":[{"time":"00:00:00","value":90},{"time":"00:01:00","value":89},{"time":"00:02:00","value":88},...],"datasetInterval":1,"datasetType":"minute"}} + + ??? info "FITBIT_STEPS_SUMMARY" **RAPIDS_COLUMN_MAPPINGS** diff --git a/docs/datastreams/mandatory-fitbit-format.md b/docs/datastreams/mandatory-fitbit-format.md index ab86601f..6394cff2 100644 --- a/docs/datastreams/mandatory-fitbit-format.md +++ b/docs/datastreams/mandatory-fitbit-format.md @@ -15,6 +15,16 @@ This is a description of the format RAPIDS needs to process data for the followi | HEARTRATE_DAILY_CALORIESCARDIO | Calories spent while heartrate was inside the cardio [zone](https://help.fitbit.com/articles/en_US/Help_article/1565.htm#) | | HEARTRATE_DAILY_CALORIESPEAK | Calories spent while heartrate was inside the peak [zone](https://help.fitbit.com/articles/en_US/Help_article/1565.htm#) | +??? info "FITBIT_HEARTRATE_INTRADAY" + + | RAPIDS column | Description | + |-----------------|-----------------| + | TIMESTAMP | A UNIX timestamp (13 digits) when a row of data was logged | + | LOCAL_DATE_TIME | Date time string with format `yyyy-mm-dd hh:mm:ss` | + | DEVICE_ID | A string that uniquely identifies a device | + | HEARTRATE | Intraday heartrate | + | HEARTRATE_ZONE | Heartrate [zone](https://help.fitbit.com/articles/en_US/Help_article/1565.htm#) that HEARTRATE belongs to. It is based on the heartrate zone ranges of each device | + ??? 
info "FITBIT_STEPS_SUMMARY" | RAPIDS column | Description | diff --git a/rules/features.smk b/rules/features.smk index 397d8ed1..0fad5212 100644 --- a/rules/features.smk +++ b/rules/features.smk @@ -558,7 +558,7 @@ rule fitbit_heartrate_summary_r_features: rule fitbit_heartrate_intraday_python_features: input: - sensor_data = "data/raw/{pid}/fitbit_heartrate_intraday_parsed_with_datetime.csv", + sensor_data = "data/raw/{pid}/fitbit_heartrate_intraday_with_datetime.csv", time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"][wildcards.provider_key.upper()], @@ -571,7 +571,7 @@ rule fitbit_heartrate_intraday_python_features: rule fitbit_heartrate_intraday_r_features: input: - sensor_data = "data/raw/{pid}/fitbit_heartrate_intraday_parsed_with_datetime.csv", + sensor_data = "data/raw/{pid}/fitbit_heartrate_intraday_with_datetime.csv", time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"][wildcards.provider_key.upper()], diff --git a/src/data/streams/fitbitjson_mysql/format.yaml b/src/data/streams/fitbitjson_mysql/format.yaml index 31ea979d..a9fd88da 100644 --- a/src/data/streams/fitbitjson_mysql/format.yaml +++ b/src/data/streams/fitbitjson_mysql/format.yaml @@ -14,6 +14,20 @@ FITBIT_HEARTRATE_SUMMARY: SCRIPTS: # List any python or r scripts that mutate your raw data - src/data/streams/mutations/fitbit/parse_heartrate_summary_json.py +FITBIT_HEARTRATE_INTRADAY: + RAPIDS_COLUMN_MAPPINGS: + TIMESTAMP: FLAG_TO_MUTATE + DEVICE_ID: device_id + LOCAL_DATE_TIME: FLAG_TO_MUTATE + HEARTRATE: FLAG_TO_MUTATE + HEARTRATE_ZONE: FLAG_TO_MUTATE + MUTATION: + COLUMN_MAPPINGS: + JSON_FITBIT_COLUMN: fitbit_data # text column with JSON objects + SCRIPTS: # List any python or r scripts that mutate your raw data + - 
import json
import pandas as pd
from datetime import datetime


# Column order shared by every parsed intraday record and by the returned DataFrame.
HR_INTRADAY_COLUMNS = ("device_id",
                       "heartrate",
                       "heartrate_zone",
                       "local_date_time",
                       "timestamp")


def parseHeartrateZones(heartrate_data):
    """Return the heartrate zone ranges of one parsed Fitbit heartrate record.

    Args:
        heartrate_data: dict decoded from one Fitbit JSON object; it must hold a
            non-empty ``"activities-heart"`` list.

    Returns:
        dict mapping the zone name (lower-cased, spaces removed, e.g.
        ``"outofrange"``, ``"fatburn"``) to ``[min, max]``.

    Raises:
        ValueError: if the zones are found neither directly in the first
            ``"activities-heart"`` entry nor under its ``"value"`` key.
    """
    # refer to: https://help.fitbit.com/articles/en_US/Help_article/1565
    heartrate_fitbit_data = heartrate_data["activities-heart"][0]
    # Two layouts are supported (the exact API versions are unknown):
    # zones directly in the entry, or nested under "value".
    if "heartRateZones" in heartrate_fitbit_data:
        heartrate_zones = heartrate_fitbit_data["heartRateZones"]
    elif "value" in heartrate_fitbit_data:
        heartrate_zones = heartrate_fitbit_data["value"]["heartRateZones"]
    else:
        raise ValueError("Heartrate zone are stored in an unknown format, this could mean Fitbit's heartrate API changed")

    return {hrzone["name"].lower().replace(" ", ""): [hrzone["min"], hrzone["max"]]
            for hrzone in heartrate_zones}


def parseHeartrateIntradayData(records_intraday, dataset, device_id, curr_date, heartrate_zones_range):
    """Append one row per intraday sample of ``dataset`` to ``records_intraday``.

    Args:
        records_intraday: list that is extended in place and also returned.
        dataset: iterable of dicts with a ``"time"`` (``HH:MM:SS``) and a ``"value"`` key.
        device_id: value stored in the ``device_id`` position of every row.
        curr_date: date (or datetime) the samples belong to.
        heartrate_zones_range: dict of zone name -> ``[min, max]``.

    Returns:
        ``records_intraday`` with tuples ordered as ``HR_INTRADAY_COLUMNS`` appended.
    """
    for data in dataset:
        d_time = datetime.strptime(data["time"], "%H:%M:%S").time()
        d_datetime = datetime.combine(curr_date, d_time)
        d_hr = data["value"]

        # First zone (in dict order) with min <= heartrate < max; None when no zone matches.
        d_hrzone = next((hrzone for hrzone, hrrange in heartrate_zones_range.items()
                         if hrrange[0] <= d_hr < hrrange[1]), None)

        # The trailing 0 is the placeholder for the timestamp column.
        records_intraday.append((device_id, d_hr, d_hrzone, d_datetime, 0))
    return records_intraday


def parseHeartrateData(heartrate_data):
    """Parse the raw Fitbit JSON rows into one row per intraday heartrate sample.

    Args:
        heartrate_data: DataFrame with a ``device_id`` column and a
            ``json_fitbit_column`` column holding JSON text.

    Returns:
        DataFrame with the columns listed in ``HR_INTRADAY_COLUMNS``.
    """
    if heartrate_data.empty:
        return pd.DataFrame(columns=list(HR_INTRADAY_COLUMNS))

    records_intraday = []
    # Pair every JSON cell with the device_id of its own row instead of
    # labelling all rows with the first row's device.
    for device_id, raw_record in zip(heartrate_data["device_id"], heartrate_data["json_fitbit_column"]):
        record = json.loads(raw_record)  # Parse text into JSON

        # The date and the zone ranges live in "activities-heart". Without them
        # the intraday samples cannot be dated, so the record is skipped rather
        # than reusing the date/zones of a previous record.
        if "activities-heart" not in record or not record["activities-heart"]:
            continue

        heartrate_zones_range = parseHeartrateZones(record)
        curr_date = datetime.strptime(record["activities-heart"][0]["dateTime"], "%Y-%m-%d")

        if "activities-heart-intraday" in record:
            dataset = record["activities-heart-intraday"]["dataset"]
            records_intraday = parseHeartrateIntradayData(
                records_intraday, dataset, device_id, curr_date, heartrate_zones_range)

    return pd.DataFrame(data=records_intraday, columns=list(HR_INTRADAY_COLUMNS))


def main(json_raw, stream_parameters):
    """Mutation entry point: parse the raw JSON rows and format ``local_date_time``.

    Args:
        json_raw: DataFrame accepted by ``parseHeartrateData``.
        stream_parameters: not used by this script.

    Returns:
        DataFrame with the ``HR_INTRADAY_COLUMNS`` columns, ``timestamp`` set to 0
        and ``local_date_time`` as ``yyyy-mm-dd hh:mm:ss`` strings.
    """
    parsed_data = parseHeartrateData(json_raw)
    parsed_data["timestamp"] = 0  # this column is added at readable_datetime.R because we need to take into account multiple timezones
    # pd.to_datetime keeps the .dt accessor valid when no record was parsed
    # (an empty frame has object-dtype columns).
    parsed_data["local_date_time"] = pd.to_datetime(parsed_data["local_date_time"]).dt.strftime("%Y-%m-%d %H:%M:%S")
    return parsed_data
a/src/data/streams/rapids_columns.yaml +++ b/src/data/streams/rapids_columns.yaml @@ -102,6 +102,13 @@ FITBIT_HEARTRATE_SUMMARY: - HEARTRATE_DAILY_CALORIESCARDIO - HEARTRATE_DAILY_CALORIESPEAK +FITBIT_HEARTRATE_INTRADAY: + - TIMESTAMP + - DEVICE_ID + - LOCAL_DATE_TIME + - HEARTRATE + - HEARTRATE_ZONE + FITBIT_STEPS_SUMMARY: - TIMESTAMP - DEVICE_ID