Parse Fitbit summary and intraday data; Extract Fitbit daily features from summary data

pull/95/head
Meng Li 2020-05-15 17:51:00 -04:00
parent d07bb9ed5f
commit 915bdd04b1
11 changed files with 476 additions and 233 deletions

View File

@ -59,15 +59,19 @@ rule all:
expand("data/processed/{pid}/applications_foreground_{day_segment}.csv",
pid = config["PIDS"],
day_segment = config["APPLICATIONS_FOREGROUND"]["DAY_SEGMENTS"]),
expand("data/raw/{pid}/fitbit_{fitbit_sensor}_with_datetime.csv",
expand("data/raw/{pid}/fitbit_{fitbit_sensor}_{fitbit_data_type}_with_datetime.csv",
pid=config["PIDS"],
fitbit_sensor=config["FITBIT_SENSORS"]),
fitbit_sensor=config["FITBIT_SENSORS"],
fitbit_data_type=config["FITBIT_DATA_TYPE"]),
expand("data/processed/{pid}/fitbit_heartrate_{day_segment}.csv",
pid = config["PIDS"],
day_segment = config["HEARTRATE"]["DAY_SEGMENTS"]),
expand("data/processed/{pid}/fitbit_step_{day_segment}.csv",
pid = config["PIDS"],
day_segment = config["STEP"]["DAY_SEGMENTS"]),
expand("data/processed/{pid}/fitbit_sleep_{day_segment}.csv",
pid = config["PIDS"],
day_segment = config["SLEEP"]["DAY_SEGMENTS"]),
expand("data/processed/{pid}/wifi_{segment}.csv",
pid=config["PIDS"],
segment = config["WIFI"]["DAY_SEGMENTS"]),

View File

@ -3,6 +3,7 @@ SENSORS: [applications_crashes, applications_foreground, applications_notificati
FITBIT_TABLE: [fitbit_data]
FITBIT_SENSORS: [heartrate, steps, sleep, calories]
FITBIT_DATA_TYPE: [summary, intraday]
# Participants to include in the analysis
# You must create a file for each participant
@ -114,6 +115,7 @@ APPLICATIONS_FOREGROUND:
HEARTRATE:
DAY_SEGMENTS: *day_segments
FEATURES: ["maxhr", "minhr", "avghr", "medianhr", "modehr", "stdhr", "diffmaxmodehr", "diffminmodehr", "entropyhr", "lengthoutofrange", "lengthfatburn", "lengthcardio", "lengthpeak"]
DAILY_FEATURES_FROM_SUMMARY_DATA: ["restinghr"] # calories related features might be inaccurate: ["caloriesoutofrange", "caloriesfatburn", "caloriescardio", "caloriespeak"]
STEP:
DAY_SEGMENTS: *day_segments
@ -124,6 +126,11 @@ STEP:
THRESHOLD_ACTIVE_BOUT: 10 # steps
INCLUDE_ZERO_STEP_ROWS: True
SLEEP:
DAY_SEGMENTS: *day_segments
SLEEP_TYPES: ["main", "nap", "all"]
DAILY_FEATURES_FROM_SUMMARY_DATA: ["sumdurationafterwakeup", "sumdurationasleep", "sumdurationawake", "sumdurationtofallasleep", "sumdurationinbed", "avgefficiency", "countepisode"]
WIFI:
DAY_SEGMENTS: *day_segments
FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
@ -133,7 +140,7 @@ PARAMS_FOR_ANALYSIS:
SOURCES: &sources ["phone_features", "fitbit_features", "phone_fitbit_features"]
DAY_SEGMENTS: *day_segments
PHONE_FEATURES: [accelerometer, applications_foreground, battery, call_incoming, call_missed, call_outgoing, activity_recognition, light, location_barnett, screen, sms_received, sms_sent]
FITBIT_FEATURES: [fitbit_heartrate, fitbit_step]
FITBIT_FEATURES: [fitbit_heartrate, fitbit_step, fitbit_sleep]
PHONE_FITBIT_FEATURES: "" # This array is merged in the input_merge_features_of_single_participant function in models.snakefile
DEMOGRAPHIC_FEATURES: [age, gender, inpatientdays]
CATEGORICAL_DEMOGRAPHIC_FEATURES: ["gender"]

View File

@ -167,12 +167,25 @@ rule applications_foreground_features:
script:
"../src/features/applications_foreground_features.py"
rule wifi_features:
input:
"data/raw/{pid}/wifi_with_datetime.csv"
params:
day_segment = "{day_segment}",
features = config["WIFI"]["FEATURES"]
output:
"data/processed/{pid}/wifi_{day_segment}.csv"
script:
"../src/features/wifi_features.R"
rule fitbit_heartrate_features:
input:
"data/raw/{pid}/fitbit_heartrate_with_datetime.csv",
heartrate_summary_data = "data/raw/{pid}/fitbit_heartrate_summary_with_datetime.csv",
heartrate_intraday_data = "data/raw/{pid}/fitbit_heartrate_intraday_with_datetime.csv"
params:
day_segment = "{day_segment}",
features = config["HEARTRATE"]["FEATURES"],
daily_features_from_summary_data = config["HEARTRATE"]["DAILY_FEATURES_FROM_SUMMARY_DATA"]
output:
"data/processed/{pid}/fitbit_heartrate_{day_segment}.csv"
script:
@ -193,13 +206,15 @@ rule fitbit_step_features:
script:
"../src/features/fitbit_step_features.py"
rule wifi_features:
rule fitbit_sleep_features:
input:
"data/raw/{pid}/wifi_with_datetime.csv"
sleep_summary_data = "data/raw/{pid}/fitbit_sleep_summary_with_datetime.csv",
sleep_intraday_data = "data/raw/{pid}/fitbit_sleep_intraday_with_datetime.csv"
params:
day_segment = "{day_segment}",
features = config["WIFI"]["FEATURES"]
sleep_types = config["SLEEP"]["SLEEP_TYPES"],
daily_features_from_summary_data = config["SLEEP"]["DAILY_FEATURES_FROM_SUMMARY_DATA"]
output:
"data/processed/{pid}/wifi_{day_segment}.csv"
"data/processed/{pid}/fitbit_sleep_{day_segment}.csv"
script:
"../src/features/wifi_features.R"
"../src/features/fitbit_sleep_features.py"

View File

@ -99,7 +99,8 @@ rule fitbit_with_datetime:
local_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"],
fitbit_sensor = "{fitbit_sensor}"
output:
"data/raw/{pid}/fitbit_{fitbit_sensor}_with_datetime.csv"
summary_data = "data/raw/{pid}/fitbit_{fitbit_sensor}_summary_with_datetime.csv",
intraday_data = "data/raw/{pid}/fitbit_{fitbit_sensor}_intraday_with_datetime.csv"
script:
"../src/data/fitbit_readable_datetime.py"

View File

@ -0,0 +1,35 @@
import json
import pandas as pd
from datetime import datetime
# Column layout of the intraday calories table built by parseCaloriesData.
CALORIES_INTRADAY_COLUMNS = (
    "device_id",
    "level", "mets", "value",
    "local_date_time", "local_date", "local_month", "local_day",
    "local_day_of_week", "local_time", "local_hour", "local_minute",
    "local_day_segment",
)


def parseCaloriesData(calories_data, HOUR2EPOCH):
    """Flatten raw calories JSON payloads into one row per intraday sample.

    Args:
        calories_data: DataFrame with a "device_id" column and a
            "fitbit_data" column holding one JSON document (as text) per row.
        HOUR2EPOCH: sequence indexed by hour of day (0-23) giving the day
            segment label of that hour.

    Returns:
        A ``(summary, intraday)`` pair of DataFrames. The summary frame is
        always empty; the intraday frame uses CALORIES_INTRADAY_COLUMNS.
    """
    summary = pd.DataFrame()
    if calories_data.empty:
        return summary, pd.DataFrame(columns=CALORIES_INTRADAY_COLUMNS)

    # Every row is stamped with the device id found in the first row.
    device_id = calories_data["device_id"].iloc[0]

    rows = []
    for raw_json in calories_data.fitbit_data:
        payload = json.loads(raw_json)
        day = datetime.strptime(
            payload["activities-calories"][0]["dateTime"], "%Y-%m-%d")
        for sample in payload["activities-calories-intraday"]["dataset"]:
            # Samples only carry a time of day; combine it with the payload date.
            time_of_day = datetime.strptime(sample["time"], "%H:%M:%S").time()
            stamp = datetime.combine(day, time_of_day)
            rows.append((
                device_id,
                sample["level"], sample["mets"], sample["value"],
                stamp, stamp.date(), stamp.month, stamp.day,
                stamp.weekday(), stamp.time(), stamp.hour, stamp.minute,
                HOUR2EPOCH[stamp.hour],
            ))

    return summary, pd.DataFrame(data=rows, columns=CALORIES_INTRADAY_COLUMNS)

View File

@ -0,0 +1,114 @@
import json
import pandas as pd
from datetime import datetime
# Column layout of the per-day heartrate summary table: the device, the day,
# and one "heartrate_daily_<feature>" column per summary feature.
HR_SUMMARY_COLUMNS = ("device_id", "local_date") + tuple(
    "heartrate_daily_" + feature
    for feature in (
        "restinghr",
        "caloriesoutofrange",
        "caloriesfatburn",
        "caloriescardio",
        "caloriespeak",
    )
)

# Column layout of the intraday heartrate table (one row per sample).
HR_INTRADAY_COLUMNS = (
    "device_id",
    "heartrate",
    "heartrate_zone",
    "local_date_time",
    "local_date",
    "local_month",
    "local_day",
    "local_day_of_week",
    "local_time",
    "local_hour",
    "local_minute",
    "local_day_segment",
)
def parseHeartrateZones(heartrate_data):
    """Return the [min, max] range of every heartrate zone.

    The zones are read from the first row of ``heartrate_data`` only and are
    keyed by the lower-cased zone name with spaces removed (zones such as
    outofrange, fatburn, cardio, peak).
    Refer to: https://help.fitbit.com/articles/en_US/Help_article/1565

    Raises:
        ValueError: if the zones are stored in neither of the two known layouts.
    """
    first_payload = json.loads(heartrate_data["fitbit_data"].iloc[0])
    day_summary = first_payload["activities-heart"][0]

    zones_at_top_level = "heartRateZones" in day_summary
    if not zones_at_top_level and "value" not in day_summary:
        raise ValueError("Heartrate zone are stored in an unkown format, this could mean Fitbit's heartrate API changed")

    if zones_at_top_level:
        # API version X (exact version unknown): zones sit next to "value"
        zones = day_summary["heartRateZones"]
    else:
        # API version Y (exact version unknown): zones are nested in "value"
        zones = day_summary["value"]["heartRateZones"]

    return {
        zone["name"].lower().replace(" ", ""): [zone["min"], zone["max"]]
        for zone in zones
    }
def parseHeartrateSummaryData(record_summary, device_id, curr_date):
    """Build the daily summary row for one "activities-heart" entry.

    Args:
        record_summary: dict holding the daily heartrate summary in one of the
            two known layouts (zones at top level, or nested under "value").
        device_id: identifier stored in the first field of the row.
        curr_date: date of the summary, stored as-is in the second field.

    Returns:
        Tuple ``(device_id, curr_date, resting heartrate, calories out of
        range, calories fat burn, calories cardio, calories peak)``. Values
        that are absent from the record are None.

    Raises:
        ValueError: if the record matches neither known layout.
    """
    # API Version X: not sure the exact version
    if "heartRateZones" in record_summary:
        heartrate_zones = record_summary["heartRateZones"]
        d_resting_heartrate = record_summary.get("value")
    # API VERSION Y: not sure the exact version
    elif "value" in record_summary:
        heartrate_zones = record_summary["value"]["heartRateZones"]
        d_resting_heartrate = record_summary["value"].get("restingHeartRate")
    else:
        # Previously the exception was created but never raised, so execution
        # fell through and failed later on the unbound heartrate_zones name.
        raise ValueError("Heartrate zone are stored in an unkown format, this could mean Fitbit's heartrate API changed")

    # Zones are read positionally: out of range, fat burn, cardio, peak.
    # An empty zone list is treated like zones without calories information.
    if heartrate_zones and "caloriesOut" in heartrate_zones[0]:
        d_calories_outofrange = heartrate_zones[0]["caloriesOut"]
        d_calories_fatburn = heartrate_zones[1]["caloriesOut"]
        d_calories_cardio = heartrate_zones[2]["caloriesOut"]
        d_calories_peak = heartrate_zones[3]["caloriesOut"]
    else:
        d_calories_outofrange = d_calories_fatburn = d_calories_cardio = d_calories_peak = None

    return (device_id,
            curr_date,
            d_resting_heartrate,
            d_calories_outofrange,
            d_calories_fatburn,
            d_calories_cardio,
            d_calories_peak)
def parseHeartrateIntradayData(records_intraday, dataset, device_id, curr_date, heartrate_zones_range, HOUR2EPOCH):
    """Append one intraday row per heartrate sample to ``records_intraday``.

    Args:
        records_intraday: list that receives the new rows (mutated in place).
        dataset: iterable of samples, each with a "time" (HH:MM:SS) and a "value".
        device_id: identifier stored in the first field of every row.
        curr_date: date the samples belong to; combined with each sample time.
        heartrate_zones_range: mapping of zone name to [min, max].
        HOUR2EPOCH: sequence indexed by hour (0-23) giving the day segment.

    Returns:
        The same ``records_intraday`` list, with the new rows appended.
    """
    for sample in dataset:
        time_of_day = datetime.strptime(sample["time"], "%H:%M:%S").time()
        stamp = datetime.combine(curr_date, time_of_day)
        bpm = sample["value"]

        # First zone whose range contains the sample (min <= heartrate < max);
        # None when the sample falls outside every zone.
        zone_name = next(
            (name for name, bounds in heartrate_zones_range.items()
             if bounds[0] <= bpm < bounds[1]),
            None,
        )

        records_intraday.append((
            device_id,
            bpm, zone_name,
            stamp, stamp.date(), stamp.month, stamp.day,
            stamp.weekday(), stamp.time(), stamp.hour, stamp.minute,
            HOUR2EPOCH[stamp.hour],
        ))
    return records_intraday
def parseHeartrateData(heartrate_data, HOUR2EPOCH):
    """Parse raw heartrate payloads into a summary and an intraday table.

    Args:
        heartrate_data: DataFrame with a "device_id" column and a
            "fitbit_data" column holding one JSON document (as text) per row.
        HOUR2EPOCH: sequence indexed by hour (0-23) giving the day segment.

    Returns:
        A ``(summary, intraday)`` pair of DataFrames using HR_SUMMARY_COLUMNS
        and HR_INTRADAY_COLUMNS respectively. Both are empty (but keep their
        columns) when ``heartrate_data`` is empty.
    """
    if heartrate_data.empty:
        # Return the same (summary, intraday) shape as the non-empty path.
        # The previous code referenced an undefined HR_COLUMNS name and
        # returned a single frame, which callers cannot unpack.
        return pd.DataFrame(columns=HR_SUMMARY_COLUMNS), pd.DataFrame(columns=HR_INTRADAY_COLUMNS)

    # Every row is stamped with the device id found in the first row.
    device_id = heartrate_data["device_id"].iloc[0]
    records_summary, records_intraday = [], []

    # Zone ranges are taken once, from the first payload.
    heartrate_zones_range = parseHeartrateZones(heartrate_data)

    # Parse JSON into individual records
    for record in heartrate_data.fitbit_data:
        record = json.loads(record)  # Parse text into JSON
        record_summary = record["activities-heart"][0]
        curr_date = datetime.strptime(record_summary["dateTime"], "%Y-%m-%d")

        records_summary.append(parseHeartrateSummaryData(record_summary, device_id, curr_date))

        dataset = record["activities-heart-intraday"]["dataset"]
        records_intraday = parseHeartrateIntradayData(records_intraday, dataset, device_id, curr_date, heartrate_zones_range, HOUR2EPOCH)

    return pd.DataFrame(data=records_summary, columns=HR_SUMMARY_COLUMNS), pd.DataFrame(data=records_intraday, columns=HR_INTRADAY_COLUMNS)

View File

@ -0,0 +1,109 @@
import json
import pandas as pd
from datetime import datetime
# Sleep API version 1 encodes levels as 1, 2, 3; position i holds the level
# name of code i + 1.
SLEEP_CODE2LEVEL = ["asleep", "restless", "awake"]

SLEEP_SUMMARY_COLUMNS_V1_2 = ("device_id", "efficiency",
                              "minutes_after_wakeup", "minutes_asleep", "minutes_awake", "minutes_to_fall_asleep", "minutes_in_bed",
                              "is_main_sleep", "type",
                              "local_start_date_time", "local_end_date_time",
                              "local_start_date", "local_end_date",
                              "local_start_day_segment", "local_end_day_segment")

# Version 1 records carry five extra awake/restless counters.
SLEEP_SUMMARY_COLUMNS_V1 = SLEEP_SUMMARY_COLUMNS_V1_2 + ("count_awake", "duration_awake", "count_awakenings", "count_restless", "duration_restless")

SLEEP_INTRADAY_COLUMNS = ("device_id",
                          # For "classic" type, original_level is one of {"awake", "restless", "asleep"}
                          # For "stages" type, original_level is one of {"wake", "deep", "light", "rem"}
                          "original_level",
                          # For "classic" type, unified_level is one of {0, 1} where 0: awake {"awake" + "restless"}, 1: asleep {"asleep"}
                          # For "stages" type, unified_level is one of {0, 1} where 0: awake {"wake"}, 1: asleep {"deep" + "light" + "rem"}
                          "unified_level",
                          # one of {0, 1} where 0: nap, 1: main sleep
                          "is_main_sleep",
                          # one of {"classic", "stages"}
                          "type",
                          "local_date_time", "local_date", "local_month", "local_day",
                          "local_day_of_week", "local_time", "local_hour", "local_minute",
                          "local_day_segment")


# Parse one record for sleep API version 1
def parseOneRecordForV1(record, device_id, d_is_main_sleep, records_summary, records_intraday, HOUR2EPOCH):
    """Append the summary row and the per-minute rows of one sleep episode.

    Args:
        record: dict describing one sleep episode in the API version 1 layout.
        device_id: identifier stored in the first field of every row.
        d_is_main_sleep: 1 for main sleep, 0 for nap; copied into every row.
        records_summary: list receiving one SLEEP_SUMMARY_COLUMNS_V1 row.
        records_intraday: list receiving one SLEEP_INTRADAY_COLUMNS row per
            entry of ``record["minuteData"]``.
        HOUR2EPOCH: sequence indexed by hour (0-23) giving the day segment.

    Returns:
        The ``(records_summary, records_intraday)`` lists, mutated in place.
    """
    sleep_record_type = "classic"

    # "YYYY-MM-DDTHH:MM:SS" is 19 characters long. Slicing only 18 dropped the
    # last digit of the seconds (":30" was parsed as 3 seconds). Anything after
    # the seconds is ignored.
    timestamp_format = "%Y-%m-%dT%H:%M:%S"
    d_start_datetime = datetime.strptime(record["startTime"][:19], timestamp_format)
    d_end_datetime = datetime.strptime(record["endTime"][:19], timestamp_format)

    # Summary data
    row_summary = (device_id, record["efficiency"],
                   record["minutesAfterWakeup"], record["minutesAsleep"], record["minutesAwake"], record["minutesToFallAsleep"], record["timeInBed"],
                   d_is_main_sleep, sleep_record_type,
                   d_start_datetime, d_end_datetime,
                   d_start_datetime.date(), d_end_datetime.date(),
                   HOUR2EPOCH[d_start_datetime.hour], HOUR2EPOCH[d_end_datetime.hour],
                   record["awakeCount"], record["awakeDuration"], record["awakeningsCount"],
                   record["restlessCount"], record["restlessDuration"])
    records_summary.append(row_summary)

    # Intraday data: minute entries only carry a time of day, so the date has
    # to be reconstructed from the episode boundaries.
    end_date = d_end_datetime.date()
    curr_date = d_start_datetime.date()
    for data in record["minuteData"]:
        d_time = datetime.strptime(data["dateTime"], '%H:%M:%S').time()
        # For overnight episodes, use end_date once we are over midnight
        # (and keep using it for the rest of the episode).
        if d_time.hour == 0:
            curr_date = end_date
        d_datetime = datetime.combine(curr_date, d_time)

        # API 1.2 stores original_level as strings, so we convert original_levels of API 1 to strings too
        # (1: "asleep", 2: "restless", 3: "awake")
        d_original_level = SLEEP_CODE2LEVEL[int(data["value"]) - 1]

        # unified_level summarises original_level (we came up with this classification)
        # 0 is awake, 1 is asleep
        # {"awake" + "restless"} are set to 0 and {"asleep"} is set to 1
        d_unified_level = 0 if d_original_level in ("awake", "restless") else 1

        row_intraday = (device_id,
                        d_original_level, d_unified_level, d_is_main_sleep, sleep_record_type,
                        d_datetime, d_datetime.date(), d_datetime.month, d_datetime.day,
                        d_datetime.weekday(), d_datetime.time(), d_datetime.hour, d_datetime.minute,
                        HOUR2EPOCH[d_datetime.hour])
        records_intraday.append(row_intraday)

    return records_summary, records_intraday
# Parse one record for sleep API version 1.2
def parseOneRecordForV12(record, d_is_main_sleep, records_summary, records_intraday):
    """Placeholder for sleep API version 1.2 records.

    Not implemented: the arguments are ignored and None is returned.
    """
    return
def parseSleepData(sleep_data, HOUR2EPOCH):
    """Parse raw sleep payloads into a summary and an intraday table.

    Args:
        sleep_data: DataFrame with a "device_id" column and a "fitbit_data"
            column holding one JSON document (as text) per row; each document
            has a "sleep" list of episodes.
        HOUR2EPOCH: sequence indexed by hour (0-23) giving the day segment.

    Returns:
        A ``(summary, intraday)`` pair of DataFrames using
        SLEEP_SUMMARY_COLUMNS_V1 and SLEEP_INTRADAY_COLUMNS. Both frames are
        empty (but keep their columns) when there is nothing to parse.

    Raises:
        ValueError: if an episode is not in the API version 1 layout
            (detected by the absence of "awakeCount").
    """
    if sleep_data.empty:
        return pd.DataFrame(columns=SLEEP_SUMMARY_COLUMNS_V1), pd.DataFrame(columns=SLEEP_INTRADAY_COLUMNS)

    # Every row is stamped with the device id found in the first row.
    device_id = sleep_data["device_id"].iloc[0]
    records_summary, records_intraday = [], []

    # Parse JSON into individual records
    for multi_record in sleep_data.fitbit_data:
        for record in json.loads(multi_record)["sleep"]:
            # Whether the sleep episode is nap (0) or main sleep (1)
            d_is_main_sleep = 1 if record["isMainSleep"] else 0

            # For sleep API version 1.2
            if "awakeCount" not in record:
                raise ValueError("Sleep data for API v1.2 is not supported yet.")

            # For sleep API version 1
            records_summary, records_intraday = parseOneRecordForV1(record, device_id, d_is_main_sleep, records_summary, records_intraday, HOUR2EPOCH)

    # Only version 1 rows can reach this point, so the summary layout is fixed.
    # Choosing it inside the loop left the name unbound whenever no payload
    # contained a sleep episode.
    return pd.DataFrame(data=records_summary, columns=SLEEP_SUMMARY_COLUMNS_V1), pd.DataFrame(data=records_intraday, columns=SLEEP_INTRADAY_COLUMNS)

View File

@ -0,0 +1,35 @@
import json
import pandas as pd
from datetime import datetime
# Column layout of the intraday steps table built by parseStepsData.
STEPS_INTRADAY_COLUMNS = ("device_id",
                          "steps",
                          "local_date_time", "local_date", "local_month", "local_day",
                          "local_day_of_week", "local_time", "local_hour", "local_minute",
                          "local_day_segment")


def parseStepsData(steps_data, HOUR2EPOCH):
    """Flatten raw steps JSON payloads into one row per intraday sample.

    Args:
        steps_data: DataFrame with a "device_id" column and a "fitbit_data"
            column holding one JSON document (as text) per row.
        HOUR2EPOCH: sequence indexed by hour of day (0-23) giving the day
            segment label of that hour.

    Returns:
        A ``(summary, intraday)`` pair of DataFrames. The summary frame is
        always empty; the intraday frame uses STEPS_INTRADAY_COLUMNS.
    """
    if steps_data.empty:
        # The previous code referenced an undefined STEPS_COLUMNS name here,
        # so empty input raised NameError instead of returning empty tables.
        return pd.DataFrame(), pd.DataFrame(columns=STEPS_INTRADAY_COLUMNS)

    # Every row is stamped with the device id found in the first row.
    device_id = steps_data["device_id"].iloc[0]
    records_intraday = []

    # Parse JSON into individual records
    for record in steps_data.fitbit_data:
        record = json.loads(record)  # Parse text into JSON
        curr_date = datetime.strptime(
            record["activities-steps"][0]["dateTime"], "%Y-%m-%d")
        dataset = record["activities-steps-intraday"]["dataset"]
        for data in dataset:
            # Samples only carry a time of day; combine it with the payload date.
            d_time = datetime.strptime(data["time"], '%H:%M:%S').time()
            d_datetime = datetime.combine(curr_date, d_time)

            row_intraday = (device_id,
                            data["value"],
                            d_datetime, d_datetime.date(), d_datetime.month, d_datetime.day,
                            d_datetime.weekday(), d_datetime.time(), d_datetime.hour, d_datetime.minute,
                            HOUR2EPOCH[d_datetime.hour])
            records_intraday.append(row_intraday)

    return pd.DataFrame(), pd.DataFrame(data=records_intraday, columns=STEPS_INTRADAY_COLUMNS)

View File

@ -1,6 +1,10 @@
import pandas as pd
import pytz, json
from datetime import datetime
from fitbit_parse_sensors.fitbit_parse_heartrate import parseHeartrateData
from fitbit_parse_sensors.fitbit_parse_sleep import parseSleepData
from fitbit_parse_sensors.fitbit_parse_steps import parseStepsData
from fitbit_parse_sensors.fitbit_parse_calories import parseCaloriesData
NIGHT = "night"
@ -10,30 +14,6 @@ EVENING = "evening"
HOUR2EPOCH = [NIGHT] * 6 + [MORNING] * 6 + [AFTERNOON] * 6 + [EVENING] * 6
# Local date/time columns that close every per-sensor table below.
_LOCAL_DATETIME_COLUMNS = (
    "local_date_time", "local_date", "local_month", "local_day",
    "local_day_of_week", "local_time", "local_hour", "local_minute",
    "local_day_segment",
)

HR_COLUMNS = ("device_id", "heartrate", "heartrate_zone") + _LOCAL_DATETIME_COLUMNS

# "sleep" holds the level code: 1: "asleep", 2: "restless", or 3: "awake"
SLEEP_COLUMNS = ("device_id", "sleep") + _LOCAL_DATETIME_COLUMNS

STEPS_COLUMNS = ("device_id", "steps") + _LOCAL_DATETIME_COLUMNS

CALORIES_COLUMNS = ("device_id", "level", "mets", "value") + _LOCAL_DATETIME_COLUMNS
def drop_duplicates(data, local_timezone):
"""
Data is pulled in intraday manner. Since data will be duplicated until the
@ -47,160 +27,6 @@ def drop_duplicates(data, local_timezone):
return data
def parse_steps_data(steps_data):
    """Flatten raw steps JSON payloads into a STEPS_COLUMNS table.

    ``steps_data`` needs a "device_id" column and a "fitbit_data" column with
    one JSON document (as text) per row. The day segment of each row is looked
    up in the module-level HOUR2EPOCH table.
    """
    if steps_data.empty:
        return pd.DataFrame(columns=STEPS_COLUMNS)

    # Every row is stamped with the device id found in the first row.
    device_id = steps_data["device_id"].iloc[0]

    rows = []
    for raw_json in steps_data.fitbit_data:
        payload = json.loads(raw_json)
        day = datetime.strptime(
            payload["activities-steps"][0]["dateTime"], "%Y-%m-%d")
        for sample in payload["activities-steps-intraday"]["dataset"]:
            # Samples only carry a time of day; combine it with the payload date.
            time_of_day = datetime.strptime(sample["time"], "%H:%M:%S").time()
            stamp = datetime.combine(day, time_of_day)
            rows.append((
                device_id,
                sample["value"],
                stamp, stamp.date(), stamp.month, stamp.day,
                stamp.weekday(), stamp.time(), stamp.hour, stamp.minute,
                HOUR2EPOCH[stamp.hour],
            ))

    return pd.DataFrame(data=rows, columns=STEPS_COLUMNS)
def parse_sleep_data(sleep_data):
    """Flatten raw sleep JSON payloads into a SLEEP_COLUMNS table.

    One row is produced per entry of each episode's "minuteData". The day
    segment of each row is looked up in the module-level HOUR2EPOCH table.
    """
    if sleep_data.empty:
        return pd.DataFrame(columns=SLEEP_COLUMNS)

    # Every row is stamped with the device id found in the first row.
    device_id = sleep_data["device_id"].iloc[0]

    rows = []
    for raw_json in sleep_data.fitbit_data:
        for episode in json.loads(raw_json)["sleep"]:
            # Only the date part (first 10 characters) of the boundaries is used.
            start_day = datetime.strptime(episode["startTime"][:10], "%Y-%m-%d")
            end_day = datetime.strptime(episode["endTime"][:10], "%Y-%m-%d")

            # Minute entries only carry a time of day. For episodes spanning
            # two days, switch to the end date at the first entry in hour 0.
            use_end_day = start_day == end_day
            for minute in episode["minuteData"]:
                time_of_day = datetime.strptime(minute["dateTime"], "%H:%M:%S").time()
                if time_of_day.hour == 0:
                    use_end_day = True
                stamp = datetime.combine(end_day if use_end_day else start_day, time_of_day)
                rows.append((
                    device_id,
                    minute["value"],
                    stamp, stamp.date(), stamp.month, stamp.day,
                    stamp.weekday(), stamp.time(), stamp.hour, stamp.minute,
                    HOUR2EPOCH[stamp.hour],
                ))

    return pd.DataFrame(data=rows, columns=SLEEP_COLUMNS)
def parse_heartrate_data(heartrate_data):
    """Flatten raw heartrate JSON payloads into an HR_COLUMNS table.

    Each sample is labelled with the heartrate zone whose range contains it
    (min <= heartrate < max), or None when no zone matches. The day segment of
    each row is looked up in the module-level HOUR2EPOCH table.

    Raises:
        ValueError: if the first payload stores the zones in an unknown layout.
    """
    if heartrate_data.empty:
        return pd.DataFrame(columns=HR_COLUMNS)

    # Every row is stamped with the device id found in the first row.
    device_id = heartrate_data["device_id"].iloc[0]

    # Get the range of heartrate zones: outofrange, fatburn, cardio, peak
    # refer to: https://help.fitbit.com/articles/en_US/Help_article/1565
    # The ranges are read once, from the first payload only.
    first_day = json.loads(heartrate_data["fitbit_data"].iloc[0])["activities-heart"][0]
    if "heartRateZones" in first_day:
        zones = first_day["heartRateZones"]
    elif "value" in first_day:
        zones = first_day["value"]["heartRateZones"]
    else:
        raise ValueError("Please check the format of fitbit heartrate raw data.")
    zone_ranges = {
        zone["name"].lower().replace(" ", ""): [zone["min"], zone["max"]]
        for zone in zones
    }

    rows = []
    for raw_json in heartrate_data.fitbit_data:
        payload = json.loads(raw_json)
        day = datetime.strptime(payload["activities-heart"][0]["dateTime"], "%Y-%m-%d")
        for sample in payload["activities-heart-intraday"]["dataset"]:
            time_of_day = datetime.strptime(sample["time"], "%H:%M:%S").time()
            stamp = datetime.combine(day, time_of_day)
            bpm = sample["value"]
            zone_name = next(
                (name for name, bounds in zone_ranges.items()
                 if bounds[0] <= bpm < bounds[1]),
                None,
            )
            rows.append((
                device_id,
                bpm,
                zone_name,
                stamp, stamp.date(), stamp.month, stamp.day,
                stamp.weekday(), stamp.time(), stamp.hour, stamp.minute,
                HOUR2EPOCH[stamp.hour],
            ))

    return pd.DataFrame(data=rows, columns=HR_COLUMNS)
def parse_calories_data(calories_data):
    """Flatten raw calories JSON payloads into a CALORIES_COLUMNS table.

    ``calories_data`` needs a "device_id" column and a "fitbit_data" column
    with one JSON document (as text) per row. The day segment of each row is
    looked up in the module-level HOUR2EPOCH table.
    """
    if calories_data.empty:
        return pd.DataFrame(columns=CALORIES_COLUMNS)

    # Every row is stamped with the device id found in the first row.
    device_id = calories_data["device_id"].iloc[0]

    rows = []
    for raw_json in calories_data.fitbit_data:
        payload = json.loads(raw_json)
        day = datetime.strptime(
            payload["activities-calories"][0]["dateTime"], "%Y-%m-%d")
        for sample in payload["activities-calories-intraday"]["dataset"]:
            # Samples only carry a time of day; combine it with the payload date.
            time_of_day = datetime.strptime(sample["time"], "%H:%M:%S").time()
            stamp = datetime.combine(day, time_of_day)
            rows.append((
                device_id,
                sample["level"], sample["mets"], sample["value"],
                stamp, stamp.date(), stamp.month, stamp.day,
                stamp.weekday(), stamp.time(), stamp.hour, stamp.minute,
                HOUR2EPOCH[stamp.hour],
            ))

    return pd.DataFrame(data=rows, columns=CALORIES_COLUMNS)
fitbit_data = pd.read_csv(snakemake.input[0])
@ -211,14 +37,16 @@ data = fitbit_data[fitbit_data["fitbit_data_type"] == sensor]
data = drop_duplicates(data, local_timezone)
if sensor == "heartrate":
data_preprocesed = parse_heartrate_data(data)
summary_data, intraday_data = parseHeartrateData(data, HOUR2EPOCH)
elif sensor == "sleep":
data_preprocesed = parse_sleep_data(data)
summary_data, intraday_data = parseSleepData(data, HOUR2EPOCH)
elif sensor == "steps":
data_preprocesed = parse_steps_data(data)
summary_data, intraday_data = parseStepsData(data, HOUR2EPOCH)
elif sensor == "calories":
data_preprocesed = parse_calories_data(data)
summary_data, intraday_data = parseCaloriesData(data, HOUR2EPOCH)
else:
raise ValueError("Please check the FITBIT_SENSORS list in config.yaml file.")
data_preprocesed.to_csv(snakemake.output[0], index=False)
# Summary data will be empty for steps and calories as it is not provided by Fitbit's API
summary_data.to_csv(snakemake.output["summary_data"], index=False)
intraday_data.to_csv(snakemake.output["intraday_data"], index=False)

View File

@ -4,47 +4,75 @@ from scipy.stats import entropy
import json
heartrate_data = pd.read_csv(snakemake.input[0], parse_dates=["local_date_time", "local_date"])
def extractHRFeaturesFromSummaryData(heartrate_summary_data, daily_features_from_summary_data):
    """Copy the requested daily heartrate features out of the summary table.

    Args:
        heartrate_summary_data: DataFrame holding one
            "heartrate_daily_<feature>" column per available feature.
        daily_features_from_summary_data: collection of requested feature
            names (restinghr, caloriesoutofrange, caloriesfatburn,
            caloriescardio, caloriespeak).

    Returns:
        DataFrame with the requested columns, with the index of the input
        moved into a regular column by ``reset_index``.
    """
    # calories features might be inaccurate: they depend on users' fitbit profile (weight, height, etc.)
    known_features = ("restinghr", "caloriesoutofrange", "caloriesfatburn", "caloriescardio", "caloriespeak")

    selected = pd.DataFrame()
    for feature in known_features:
        if feature not in daily_features_from_summary_data:
            continue
        column = "heartrate_daily_" + feature
        selected[column] = heartrate_summary_data[column]

    selected.reset_index(inplace=True)
    return selected
def extractHRFeaturesFromIntradayData(heartrate_intraday_data, features):
    """Compute per-day heartrate features from the intraday table.

    Reads the module-level ``day_segment``: unless it is "daily", only rows
    whose "local_day_segment" equals it are used. Output columns are named
    "heartrate_<day_segment>_<feature>".

    Args:
        heartrate_intraday_data: DataFrame with at least the columns
            device_id, heartrate, heartrate_zone, local_date, local_hour,
            local_minute and local_day_segment.
        features: collection of requested feature names.

    Returns:
        DataFrame with a "local_date" column plus one column per computed
        feature. When there is no data (before or after the segment filter)
        an empty frame with all requested columns is returned.
    """
    prefix = "heartrate_" + day_segment + "_"
    empty_features = pd.DataFrame(columns=["local_date"] + [prefix + x for x in features])
    if heartrate_intraday_data.empty:
        return empty_features

    # Average number of rows per minute, measured on ALL the data (before the
    # day segment filter); used to convert row counts into minutes.
    num_rows_per_minute = heartrate_intraday_data.groupby(["local_date", "local_hour", "local_minute"]).count().mean()["device_id"]

    if day_segment != "daily":
        heartrate_intraday_data = heartrate_intraday_data[heartrate_intraday_data["local_day_segment"] == day_segment]
    if heartrate_intraday_data.empty:
        return empty_features

    # Group once and reuse; the grouping was previously rebuilt per feature.
    hr_by_date = heartrate_intraday_data.groupby(["local_date"])["heartrate"]

    # Start from every date present in the data so no day is dropped when the
    # first computed column (e.g. a zone length) has no rows for that day.
    heartrate_intraday_features = pd.DataFrame(index=hr_by_date.size().index)

    # The mode feeds three features; compute it at most once.
    hr_mode = None
    if any(x in features for x in ("modehr", "diffmaxmodehr", "diffminmodehr")):
        hr_mode = hr_by_date.agg(lambda x: pd.Series.mode(x)[0])

    # get stats of heartrate
    if "maxhr" in features:
        heartrate_intraday_features[prefix + "maxhr"] = hr_by_date.max()
    if "minhr" in features:
        heartrate_intraday_features[prefix + "minhr"] = hr_by_date.min()
    if "avghr" in features:
        heartrate_intraday_features[prefix + "avghr"] = hr_by_date.mean()
    if "medianhr" in features:
        heartrate_intraday_features[prefix + "medianhr"] = hr_by_date.median()
    if "modehr" in features:
        heartrate_intraday_features[prefix + "modehr"] = hr_mode
    if "stdhr" in features:
        heartrate_intraday_features[prefix + "stdhr"] = hr_by_date.std()
    if "diffmaxmodehr" in features:
        heartrate_intraday_features[prefix + "diffmaxmodehr"] = hr_by_date.max() - hr_mode
    if "diffminmodehr" in features:
        heartrate_intraday_features[prefix + "diffminmodehr"] = hr_mode - hr_by_date.min()
    if "entropyhr" in features:
        heartrate_intraday_features[prefix + "entropyhr"] = hr_by_date.agg(entropy)

    # get number of minutes in each heart rate zone
    # Iterate a fixed tuple (not a set intersection) so that the column order
    # of the output is the same on every run.
    for feature_name in ("lengthoutofrange", "lengthfatburn", "lengthcardio", "lengthpeak"):
        if feature_name not in features:
            continue
        column = prefix + feature_name
        # feature_name[6:] strips the "length" prefix, leaving the zone name
        heartrate_zone = heartrate_intraday_data[heartrate_intraday_data["heartrate_zone"] == feature_name[6:]]
        heartrate_intraday_features[column] = heartrate_zone.groupby(["local_date"])["device_id"].count() / num_rows_per_minute
        # days without any row in this zone spent 0 minutes in it
        heartrate_intraday_features[column] = heartrate_intraday_features[column].fillna(0)

    heartrate_intraday_features.reset_index(inplace=True)
    return heartrate_intraday_features
heartrate_summary_data = pd.read_csv(snakemake.input["heartrate_summary_data"], index_col=["local_date"], parse_dates=["local_date"])
heartrate_intraday_data = pd.read_csv(snakemake.input["heartrate_intraday_data"], parse_dates=["local_date_time", "local_date"])
day_segment = snakemake.params["day_segment"]
features = snakemake.params["features"]
daily_features_from_summary_data = snakemake.params["daily_features_from_summary_data"]
heartrate_features = pd.DataFrame(columns=["local_date"] + ["heartrate_" + day_segment + "_" + x for x in features])
if not heartrate_data.empty:
device_id = heartrate_data["device_id"][0]
num_rows_per_minute = heartrate_data.groupby(["local_date", "local_hour", "local_minute"]).count().mean()["device_id"]
if day_segment != "daily":
heartrate_data =heartrate_data[heartrate_data["local_day_segment"] == day_segment]
if not heartrate_data.empty:
heartrate_features = pd.DataFrame()
# get stats of heartrate
if "maxhr" in features:
heartrate_features["heartrate_" + day_segment + "_maxhr"] = heartrate_data.groupby(["local_date"])["heartrate"].max()
if "minhr" in features:
heartrate_features["heartrate_" + day_segment + "_minhr"] = heartrate_data.groupby(["local_date"])["heartrate"].min()
if "avghr" in features:
heartrate_features["heartrate_" + day_segment + "_avghr"] = heartrate_data.groupby(["local_date"])["heartrate"].mean()
if "medianhr" in features:
heartrate_features["heartrate_" + day_segment + "_medianhr"] = heartrate_data.groupby(["local_date"])["heartrate"].median()
if "modehr" in features:
heartrate_features["heartrate_" + day_segment + "_modehr"] = heartrate_data.groupby(["local_date"])["heartrate"].agg(lambda x: pd.Series.mode(x)[0])
if "stdhr" in features:
heartrate_features["heartrate_" + day_segment + "_stdhr"] = heartrate_data.groupby(["local_date"])["heartrate"].std()
if "diffmaxmodehr" in features:
heartrate_features["heartrate_" + day_segment + "_diffmaxmodehr"] = heartrate_data.groupby(["local_date"])["heartrate"].max() - heartrate_data.groupby(["local_date"])["heartrate"].agg(lambda x: pd.Series.mode(x)[0])
if "diffminmodehr" in features:
heartrate_features["heartrate_" + day_segment + "_diffminmodehr"] = heartrate_data.groupby(["local_date"])["heartrate"].agg(lambda x: pd.Series.mode(x)[0]) - heartrate_data.groupby(["local_date"])["heartrate"].min()
if "entropyhr" in features:
heartrate_features["heartrate_" + day_segment + "_entropyhr"] = heartrate_data.groupby(["local_date"])["heartrate"].agg(entropy)
# get number of minutes in each heart rate zone
for feature_name in list(set(["lengthoutofrange", "lengthfatburn", "lengthcardio", "lengthpeak"]) & set(features)):
heartrate_zone = heartrate_data[heartrate_data["heartrate_zone"] == feature_name[6:]]
heartrate_features["heartrate_" + day_segment + "_" + feature_name] = heartrate_zone.groupby(["local_date"])["device_id"].count() / num_rows_per_minute
heartrate_features.fillna(value={"heartrate_" + day_segment + "_" + feature_name: 0}, inplace=True)
heartrate_features = heartrate_features.reset_index()
heartrate_intraday_features = extractHRFeaturesFromIntradayData(heartrate_intraday_data, features)
if not heartrate_summary_data.empty and day_segment == "daily" and daily_features_from_summary_data != []:
heartrate_summary_features = extractHRFeaturesFromSummaryData(heartrate_summary_data, daily_features_from_summary_data)
heartrate_features = heartrate_intraday_features.merge(heartrate_summary_features, on=["local_date"], how="outer")
else:
heartrate_features = heartrate_intraday_features
heartrate_features.to_csv(snakemake.output[0], index=False)

View File

@ -0,0 +1,67 @@
import pandas as pd
import itertools
def dailyFeaturesFromSummaryData(sleep_summary_data, sleep_type):
    """Add the daily sleep features of one sleep type to ``sleep_daily_features``.

    Relies on two module-level names: ``daily_features_from_summary_data``
    (the requested feature names) and ``sleep_daily_features`` (the frame that
    receives the columns). Episodes are grouped by "local_end_date" and every
    column is named "sleep_daily_<feature><sleep_type>".

    NOTE: columns are assigned into the existing frame, so pandas aligns them
    to whatever row index that frame already has.

    Args:
        sleep_summary_data: DataFrame with one row per sleep episode.
        sleep_type: "main" (is_main_sleep == 1), "nap" (is_main_sleep == 0)
            or "all" (no filtering).

    Returns:
        The module-level ``sleep_daily_features`` frame, updated in place.

    Raises:
        ValueError: if ``sleep_type`` is not one of the three known values.
    """
    if sleep_type == "main":
        sleep_summary_data = sleep_summary_data[sleep_summary_data["is_main_sleep"] == 1]
    elif sleep_type == "nap":
        sleep_summary_data = sleep_summary_data[sleep_summary_data["is_main_sleep"] == 0]
    elif sleep_type != "all":
        raise ValueError("sleep_type can only be one of ['main', 'nap', 'all'].")

    # feature name -> summary column summed per day
    sum_feature_columns = {
        "sumdurationafterwakeup": "minutes_after_wakeup",
        "sumdurationasleep": "minutes_asleep",
        "sumdurationawake": "minutes_awake",
        "sumdurationtofallasleep": "minutes_to_fall_asleep",
        "sumdurationinbed": "minutes_in_bed",
    }

    episodes_by_end_date = sleep_summary_data.groupby(["local_end_date"])

    features_sum = episodes_by_end_date[list(sum_feature_columns.values())].sum()
    features_sum.index.rename("local_date", inplace=True)
    for feature, column in sum_feature_columns.items():
        if feature in daily_features_from_summary_data:
            sleep_daily_features["sleep_daily_" + feature + sleep_type] = features_sum[column]

    features_avg = episodes_by_end_date[["efficiency"]].mean()
    features_avg.index.rename("local_date", inplace=True)
    if "avgefficiency" in daily_features_from_summary_data:
        sleep_daily_features["sleep_daily_avgefficiency" + sleep_type] = features_avg["efficiency"]

    features_count = episodes_by_end_date[["local_start_date_time"]].count()
    features_count.index.rename("local_date", inplace=True)
    if "countepisode" in daily_features_from_summary_data:
        # The column must carry the full feature name ("countepisode"). It was
        # previously written as "sleep_daily_count<type>", which does not match
        # the configured feature and left the real column empty.
        sleep_daily_features["sleep_daily_countepisode" + sleep_type] = features_count["local_start_date_time"]

    return sleep_daily_features
# Script body: reads the sleep summary table, computes the daily features for
# every configured sleep type and writes them to the rule's output file.
sleep_summary_data = pd.read_csv(snakemake.input["sleep_summary_data"])
sleep_types = snakemake.params["sleep_types"]
daily_features_from_summary_data = snakemake.params["daily_features_from_summary_data"]
day_segment = snakemake.params["day_segment"]

# Fail loudly and early: the exception used to be created without being
# raised, so other day segments silently produced no output file.
if day_segment != "daily":
    raise ValueError("Sleep summary features are only implemented for daily day segments")

# A day without episodes of a given type has 0 minutes / 0 episodes, but its
# average efficiency is undefined and must stay empty.
daily_features_can_be_zero = sorted(set(daily_features_from_summary_data) - {"avgefficiency"})
colnames_can_be_zero = ["sleep_daily_" + feature + sleep_type for feature, sleep_type in itertools.product(daily_features_can_be_zero, sleep_types)]
colnames = ["sleep_daily_" + feature + sleep_type for feature, sleep_type in itertools.product(daily_features_from_summary_data, sleep_types)]

if sleep_summary_data.empty:
    # Keep "local_date" as the (empty) index so the CSV header is the same as
    # in the non-empty case, without a spurious unnamed index column.
    sleep_daily_features = pd.DataFrame(columns=colnames, index=pd.Index([], name="local_date"))
else:
    # Index the result by every date that ends a sleep episode. Otherwise the
    # first assigned column decides the rows, and dates that only have
    # episodes of a later sleep type are dropped.
    all_dates = sorted(sleep_summary_data["local_end_date"].dropna().unique())
    sleep_daily_features = pd.DataFrame(columns=colnames, index=pd.Index(all_dates, name="local_date"))
    for sleep_type in sleep_types:
        sleep_daily_features = dailyFeaturesFromSummaryData(sleep_summary_data, sleep_type)
    sleep_daily_features[colnames_can_be_zero] = sleep_daily_features[colnames_can_be_zero].fillna(0)

# The index ("local_date") is written as the first CSV column.
sleep_daily_features.to_csv(snakemake.output[0])