diff --git a/Snakefile b/Snakefile index bc1c849a..22307b61 100644 --- a/Snakefile +++ b/Snakefile @@ -59,15 +59,19 @@ rule all: expand("data/processed/{pid}/applications_foreground_{day_segment}.csv", pid = config["PIDS"], day_segment = config["APPLICATIONS_FOREGROUND"]["DAY_SEGMENTS"]), - expand("data/raw/{pid}/fitbit_{fitbit_sensor}_with_datetime.csv", + expand("data/raw/{pid}/fitbit_{fitbit_sensor}_{fitbit_data_type}_with_datetime.csv", pid=config["PIDS"], - fitbit_sensor=config["FITBIT_SENSORS"]), + fitbit_sensor=config["FITBIT_SENSORS"], + fitbit_data_type=config["FITBIT_DATA_TYPE"]), expand("data/processed/{pid}/fitbit_heartrate_{day_segment}.csv", pid = config["PIDS"], day_segment = config["HEARTRATE"]["DAY_SEGMENTS"]), expand("data/processed/{pid}/fitbit_step_{day_segment}.csv", pid = config["PIDS"], day_segment = config["STEP"]["DAY_SEGMENTS"]), + expand("data/processed/{pid}/fitbit_sleep_{day_segment}.csv", + pid = config["PIDS"], + day_segment = config["SLEEP"]["DAY_SEGMENTS"]), expand("data/processed/{pid}/wifi_{segment}.csv", pid=config["PIDS"], segment = config["WIFI"]["DAY_SEGMENTS"]), diff --git a/config.yaml b/config.yaml index f0aa6816..6074f26e 100644 --- a/config.yaml +++ b/config.yaml @@ -3,6 +3,7 @@ SENSORS: [applications_crashes, applications_foreground, applications_notificati FITBIT_TABLE: [fitbit_data] FITBIT_SENSORS: [heartrate, steps, sleep, calories] +FITBIT_DATA_TYPE: [summary, intraday] # Participants to include in the analysis # You must create a file for each participant @@ -114,6 +115,7 @@ APPLICATIONS_FOREGROUND: HEARTRATE: DAY_SEGMENTS: *day_segments FEATURES: ["maxhr", "minhr", "avghr", "medianhr", "modehr", "stdhr", "diffmaxmodehr", "diffminmodehr", "entropyhr", "lengthoutofrange", "lengthfatburn", "lengthcardio", "lengthpeak"] + DAILY_FEATURES_FROM_SUMMARY_DATA: ["restinghr"] # calories related features might be inaccurate: ["caloriesoutofrange", "caloriesfatburn", "caloriescardio", "caloriespeak"] STEP: DAY_SEGMENTS: 
*day_segments @@ -124,6 +126,11 @@ STEP: THRESHOLD_ACTIVE_BOUT: 10 # steps INCLUDE_ZERO_STEP_ROWS: True +SLEEP: + DAY_SEGMENTS: *day_segments + SLEEP_TYPES: ["main", "nap", "all"] + DAILY_FEATURES_FROM_SUMMARY_DATA: ["sumdurationafterwakeup", "sumdurationasleep", "sumdurationawake", "sumdurationtofallasleep", "sumdurationinbed", "avgefficiency", "countepisode"] + WIFI: DAY_SEGMENTS: *day_segments FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"] @@ -133,7 +140,7 @@ PARAMS_FOR_ANALYSIS: SOURCES: &sources ["phone_features", "fitbit_features", "phone_fitbit_features"] DAY_SEGMENTS: *day_segments PHONE_FEATURES: [accelerometer, applications_foreground, battery, call_incoming, call_missed, call_outgoing, activity_recognition, light, location_barnett, screen, sms_received, sms_sent] - FITBIT_FEATURES: [fitbit_heartrate, fitbit_step] + FITBIT_FEATURES: [fitbit_heartrate, fitbit_step, fitbit_sleep] PHONE_FITBIT_FEATURES: "" # This array is merged in the input_merge_features_of_single_participant function in models.snakefile DEMOGRAPHIC_FEATURES: [age, gender, inpatientdays] CATEGORICAL_DEMOGRAPHIC_FEATURES: ["gender"] diff --git a/rules/features.snakefile b/rules/features.snakefile index 6bc5b694..df8fbe89 100644 --- a/rules/features.snakefile +++ b/rules/features.snakefile @@ -167,12 +167,25 @@ rule applications_foreground_features: script: "../src/features/applications_foreground_features.py" +rule wifi_features: + input: + "data/raw/{pid}/wifi_with_datetime.csv" + params: + day_segment = "{day_segment}", + features = config["WIFI"]["FEATURES"] + output: + "data/processed/{pid}/wifi_{day_segment}.csv" + script: + "../src/features/wifi_features.R" + rule fitbit_heartrate_features: input: - "data/raw/{pid}/fitbit_heartrate_with_datetime.csv", + heartrate_summary_data = "data/raw/{pid}/fitbit_heartrate_summary_with_datetime.csv", + heartrate_intraday_data = "data/raw/{pid}/fitbit_heartrate_intraday_with_datetime.csv" params: day_segment = 
"{day_segment}", features = config["HEARTRATE"]["FEATURES"], + daily_features_from_summary_data = config["HEARTRATE"]["DAILY_FEATURES_FROM_SUMMARY_DATA"] output: "data/processed/{pid}/fitbit_heartrate_{day_segment}.csv" script: @@ -193,13 +206,15 @@ rule fitbit_step_features: script: "../src/features/fitbit_step_features.py" -rule wifi_features: - input: - "data/raw/{pid}/wifi_with_datetime.csv" +rule fitbit_sleep_features: + input: + sleep_summary_data = "data/raw/{pid}/fitbit_sleep_summary_with_datetime.csv", + sleep_intraday_data = "data/raw/{pid}/fitbit_sleep_intraday_with_datetime.csv" params: day_segment = "{day_segment}", - features = config["WIFI"]["FEATURES"] + sleep_types = config["SLEEP"]["SLEEP_TYPES"], + daily_features_from_summary_data = config["SLEEP"]["DAILY_FEATURES_FROM_SUMMARY_DATA"] output: - "data/processed/{pid}/wifi_{day_segment}.csv" + "data/processed/{pid}/fitbit_sleep_{day_segment}.csv" script: - "../src/features/wifi_features.R" \ No newline at end of file + "../src/features/fitbit_sleep_features.py" diff --git a/rules/preprocessing.snakefile b/rules/preprocessing.snakefile index 2020861e..34291ea1 100644 --- a/rules/preprocessing.snakefile +++ b/rules/preprocessing.snakefile @@ -99,7 +99,8 @@ rule fitbit_with_datetime: local_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"], fitbit_sensor = "{fitbit_sensor}" output: - "data/raw/{pid}/fitbit_{fitbit_sensor}_with_datetime.csv" + summary_data = "data/raw/{pid}/fitbit_{fitbit_sensor}_summary_with_datetime.csv", + intraday_data = "data/raw/{pid}/fitbit_{fitbit_sensor}_intraday_with_datetime.csv" script: "../src/data/fitbit_readable_datetime.py" diff --git a/src/data/fitbit_parse_sensors/fitbit_parse_calories.py b/src/data/fitbit_parse_sensors/fitbit_parse_calories.py new file mode 100644 index 00000000..08a0ed53 --- /dev/null +++ b/src/data/fitbit_parse_sensors/fitbit_parse_calories.py @@ -0,0 +1,35 @@ +import json +import pandas as pd +from datetime import datetime + + 
CALORIES_INTRADAY_COLUMNS = ("device_id",
                             "level", "mets", "value",
                             "local_date_time", "local_date", "local_month", "local_day",
                             "local_day_of_week", "local_time", "local_hour", "local_minute",
                             "local_day_segment")


def parseCaloriesData(calories_data, HOUR2EPOCH):
    """Flatten raw Fitbit calories JSON rows into one row per intraday sample.

    Parameters:
        calories_data: DataFrame with a "device_id" column and a "fitbit_data"
            column holding one JSON document (as text) per row.
        HOUR2EPOCH: sequence indexed by hour (0-23) giving the day segment label.

    Returns:
        (summary, intraday): the summary frame is always empty for calories;
        the intraday frame has the CALORIES_INTRADAY_COLUMNS columns.
    """
    # No daily summary is produced for calories, so the first frame stays empty.
    summary = pd.DataFrame()
    if calories_data.empty:
        return summary, pd.DataFrame(columns=CALORIES_INTRADAY_COLUMNS)

    device_id = calories_data["device_id"].iloc[0]
    rows = []
    for raw_json in calories_data.fitbit_data:
        payload = json.loads(raw_json)  # Parse text into JSON
        day = datetime.strptime(
            payload["activities-calories"][0]["dateTime"], "%Y-%m-%d")
        for sample in payload["activities-calories-intraday"]["dataset"]:
            # Samples only carry a clock time; attach the day of the record.
            clock = datetime.strptime(sample["time"], '%H:%M:%S').time()
            stamp = datetime.combine(day, clock)
            rows.append((device_id,
                         sample["level"], sample["mets"], sample["value"],
                         stamp, stamp.date(), stamp.month, stamp.day,
                         stamp.weekday(), stamp.time(), stamp.hour, stamp.minute,
                         HOUR2EPOCH[stamp.hour]))

    return summary, pd.DataFrame(data=rows, columns=CALORIES_INTRADAY_COLUMNS)
HR_SUMMARY_COLUMNS = ("device_id",
                      "local_date",
                      "heartrate_daily_restinghr",
                      "heartrate_daily_caloriesoutofrange",
                      "heartrate_daily_caloriesfatburn",
                      "heartrate_daily_caloriescardio",
                      "heartrate_daily_caloriespeak")

HR_INTRADAY_COLUMNS = ("device_id",
                       "heartrate", "heartrate_zone",
                       "local_date_time", "local_date", "local_month", "local_day",
                       "local_day_of_week", "local_time", "local_hour", "local_minute",
                       "local_day_segment")


def parseHeartrateZones(heartrate_data):
    """Return the heartrate zone ranges found in the first record.

    Zone names are lower-cased with spaces removed and mapped to [min, max].
    Only the first row of ``heartrate_data`` is inspected.

    Raises:
        ValueError: the record matches neither known layout.
    """
    # Get the range of heartrate zones: outofrange, fatburn, cardio, peak
    # refer to: https://help.fitbit.com/articles/en_US/Help_article/1565
    heartrate_fitbit_data = json.loads(heartrate_data["fitbit_data"].iloc[0])["activities-heart"][0]
    # API Version X: not sure the exact version
    if "heartRateZones" in heartrate_fitbit_data:
        heartrate_zones = heartrate_fitbit_data["heartRateZones"]
    # API VERSION Y: not sure the exact version
    elif "value" in heartrate_fitbit_data:
        heartrate_zones = heartrate_fitbit_data["value"]["heartRateZones"]
    else:
        raise ValueError("Heartrate zone are stored in an unkown format, this could mean Fitbit's heartrate API changed")

    return {hrzone["name"].lower().replace(" ", ""): [hrzone["min"], hrzone["max"]]
            for hrzone in heartrate_zones}


def parseHeartrateSummaryData(record_summary, device_id, curr_date):
    """Build one daily summary row (ordered as HR_SUMMARY_COLUMNS).

    Parameters:
        record_summary: the first element of a record's "activities-heart" list.
        device_id: value stored in the "device_id" column.
        curr_date: value stored in the "local_date" column.

    Raises:
        ValueError: the record matches neither known layout.
    """
    # API Version X: not sure the exact version
    if "heartRateZones" in record_summary:
        heartrate_zones = record_summary["heartRateZones"]
        d_resting_heartrate = record_summary["value"] if "value" in record_summary else None
    # API VERSION Y: not sure the exact version
    elif "value" in record_summary:
        heartrate_zones = record_summary["value"]["heartRateZones"]
        d_resting_heartrate = record_summary["value"].get("restingHeartRate")
    else:
        # The exception must be raised; merely constructing it would let the
        # function continue and fail later with an unrelated UnboundLocalError.
        raise ValueError("Heartrate zone are stored in an unkown format, this could mean Fitbit's heartrate API changed")

    # Calories are read positionally: the first four zones are taken to be
    # outofrange, fatburn, cardio, peak (assumed ordering — TODO confirm against the API).
    if heartrate_zones and "caloriesOut" in heartrate_zones[0]:
        zone_calories = [hrzone.get("caloriesOut") for hrzone in heartrate_zones[:4]]
        # Pad so a short zone list yields None instead of an IndexError.
        zone_calories += [None] * (4 - len(zone_calories))
    else:
        zone_calories = [None] * 4

    return (device_id, curr_date, d_resting_heartrate) + tuple(zone_calories)


def parseHeartrateIntradayData(records_intraday, dataset, device_id, curr_date, heartrate_zones_range, HOUR2EPOCH):
    """Append one row per intraday sample in ``dataset`` to ``records_intraday``.

    The list is extended in place and also returned. Each row is ordered as
    HR_INTRADAY_COLUMNS; the zone is None when no range contains the value.
    """
    for data in dataset:
        d_time = datetime.strptime(data["time"], '%H:%M:%S').time()
        d_datetime = datetime.combine(curr_date, d_time)
        d_hr = data["value"]

        # Get heartrate zone by range: min <= heartrate < max (first match wins)
        d_hrzone = None
        for hrzone, hrrange in heartrate_zones_range.items():
            if hrrange[0] <= d_hr < hrrange[1]:
                d_hrzone = hrzone
                break

        records_intraday.append((device_id,
                                 d_hr, d_hrzone,
                                 d_datetime, d_datetime.date(), d_datetime.month, d_datetime.day,
                                 d_datetime.weekday(), d_datetime.time(), d_datetime.hour, d_datetime.minute,
                                 HOUR2EPOCH[d_datetime.hour]))
    return records_intraday


def parseHeartrateData(heartrate_data, HOUR2EPOCH):
    """Parse raw Fitbit heartrate rows into (summary, intraday) DataFrames.

    Parameters:
        heartrate_data: DataFrame with "device_id" and "fitbit_data" (JSON text).
        HOUR2EPOCH: sequence indexed by hour (0-23) giving the day segment label.

    Returns:
        A pair of frames with HR_SUMMARY_COLUMNS and HR_INTRADAY_COLUMNS.
        Empty input yields two empty frames that still carry those columns,
        so callers can always unpack two values and read the headers back.
    """
    if heartrate_data.empty:
        return pd.DataFrame(columns=HR_SUMMARY_COLUMNS), pd.DataFrame(columns=HR_INTRADAY_COLUMNS)
    device_id = heartrate_data["device_id"].iloc[0]
    records_summary, records_intraday = [], []

    heartrate_zones_range = parseHeartrateZones(heartrate_data)

    # Parse JSON into individual records
    for record in heartrate_data.fitbit_data:
        record = json.loads(record)  # Parse text into JSON
        record_summary = record["activities-heart"][0]
        curr_date = datetime.strptime(record_summary["dateTime"], "%Y-%m-%d")

        records_summary.append(parseHeartrateSummaryData(record_summary, device_id, curr_date))

        dataset = record["activities-heart-intraday"]["dataset"]
        records_intraday = parseHeartrateIntradayData(records_intraday, dataset, device_id, curr_date, heartrate_zones_range, HOUR2EPOCH)

    return pd.DataFrame(data=records_summary, columns=HR_SUMMARY_COLUMNS), pd.DataFrame(data=records_intraday, columns=HR_INTRADAY_COLUMNS)
SLEEP_CODE2LEVEL = ["asleep", "restless", "awake"]


SLEEP_SUMMARY_COLUMNS_V1_2 = ("device_id", "efficiency",
                              "minutes_after_wakeup", "minutes_asleep", "minutes_awake", "minutes_to_fall_asleep", "minutes_in_bed",
                              "is_main_sleep", "type",
                              "local_start_date_time", "local_end_date_time",
                              "local_start_date", "local_end_date",
                              "local_start_day_segment", "local_end_day_segment")
SLEEP_SUMMARY_COLUMNS_V1 = SLEEP_SUMMARY_COLUMNS_V1_2 + ("count_awake", "duration_awake", "count_awakenings", "count_restless", "duration_restless")

SLEEP_INTRADAY_COLUMNS = ("device_id",
                          # For "classic" type, original_level is one of {"awake", "restless", "asleep"}
                          # For "stages" type, original_level is one of {"wake", "deep", "light", "rem"}
                          "original_level",
                          # For "classic" type, unified_level is one of {0, 1} where 0: awake {"awake" + "restless"}, 1: asleep {"asleep"}
                          # For "stages" type, unified_level is one of {0, 1} where 0: awake {"wake"}, 1: asleep {"deep" + "light" + "rem"}
                          "unified_level",
                          # one of {0, 1} where 0: nap, 1: main sleep
                          "is_main_sleep",
                          # one of {"classic", "stages"}
                          "type",
                          "local_date_time", "local_date", "local_month", "local_day",
                          "local_day_of_week", "local_time", "local_hour", "local_minute",
                          "local_day_segment")

# Number of characters in a "YYYY-MM-DDTHH:MM:SS" timestamp prefix
_SLEEP_TIMESTAMP_LENGTH = len("YYYY-MM-DDTHH:MM:SS")


# Parse one record for sleep API version 1
def parseOneRecordForV1(record, device_id, d_is_main_sleep, records_summary, records_intraday, HOUR2EPOCH):
    """Append the summary row and per-sample rows of one API v1 sleep episode.

    Both lists are extended in place and returned as
    (records_summary, records_intraday). Summary rows follow
    SLEEP_SUMMARY_COLUMNS_V1 and intraday rows follow SLEEP_INTRADAY_COLUMNS.
    """
    # Summary data
    sleep_record_type = "classic"

    # Keep the full seconds field: slicing one character short turned ":30"
    # into ":3", which strptime silently accepts as 3 seconds.
    d_start_datetime = datetime.strptime(record["startTime"][:_SLEEP_TIMESTAMP_LENGTH], "%Y-%m-%dT%H:%M:%S")
    d_end_datetime = datetime.strptime(record["endTime"][:_SLEEP_TIMESTAMP_LENGTH], "%Y-%m-%dT%H:%M:%S")

    row_summary = (device_id, record["efficiency"],
                   record["minutesAfterWakeup"], record["minutesAsleep"], record["minutesAwake"], record["minutesToFallAsleep"], record["timeInBed"],
                   d_is_main_sleep, sleep_record_type,
                   d_start_datetime, d_end_datetime,
                   d_start_datetime.date(), d_end_datetime.date(),
                   HOUR2EPOCH[d_start_datetime.hour], HOUR2EPOCH[d_end_datetime.hour],
                   record["awakeCount"], record["awakeDuration"], record["awakeningsCount"],
                   record["restlessCount"], record["restlessDuration"])

    records_summary.append(row_summary)

    # Intraday data
    end_date = d_end_datetime.date()
    curr_date = d_start_datetime.date()
    for data in record["minuteData"]:
        d_time = datetime.strptime(data["dateTime"], '%H:%M:%S').time()
        # For overnight episodes, use end_date once we are over midnight
        # (samples only carry a clock time, so hour 0 marks the rollover).
        if d_time.hour == 0:
            curr_date = end_date
        d_datetime = datetime.combine(curr_date, d_time)

        # API 1.2 stores original_level as strings, so we convert original_levels of API 1 to strings too
        # (1: "asleep", 2: "restless", 3: "awake")
        d_original_level = SLEEP_CODE2LEVEL[int(data["value"]) - 1]

        # unified_level summarises original_level (we came up with this classification)
        # 0 is awake, 1 is asleep
        # {"awake" + "restless"} are set to 0 and {"asleep"} is set to 1
        d_unified_level = 0 if d_original_level in ("awake", "restless") else 1

        row_intraday = (device_id,
                        d_original_level, d_unified_level, d_is_main_sleep, sleep_record_type,
                        d_datetime, d_datetime.date(), d_datetime.month, d_datetime.day,
                        d_datetime.weekday(), d_datetime.time(), d_datetime.hour, d_datetime.minute,
                        HOUR2EPOCH[d_datetime.hour])

        records_intraday.append(row_intraday)

    return records_summary, records_intraday


# Parse one record for sleep API version 1.2
def parseOneRecordForV12(record, d_is_main_sleep, records_summary, records_intraday):
    """Placeholder for API v1.2 parsing: does nothing and returns None."""
    return None


def parseSleepData(sleep_data, HOUR2EPOCH):
    """Parse raw Fitbit sleep rows into (summary, intraday) DataFrames.

    Parameters:
        sleep_data: DataFrame with "device_id" and "fitbit_data" (JSON text
            whose "sleep" key lists the episodes).
        HOUR2EPOCH: sequence indexed by hour (0-23) giving the day segment label.

    Returns:
        A summary frame (SLEEP_SUMMARY_COLUMNS_V1) and an intraday frame
        (SLEEP_INTRADAY_COLUMNS); both are empty but keep their columns when
        there are no episodes.

    Raises:
        ValueError: an episode is not in the API v1 layout (no "awakeCount").
    """
    if sleep_data.empty:
        return pd.DataFrame(columns=SLEEP_SUMMARY_COLUMNS_V1), pd.DataFrame(columns=SLEEP_INTRADAY_COLUMNS)
    device_id = sleep_data["device_id"].iloc[0]
    records_summary, records_intraday = [], []
    # Chosen up front so rows whose "sleep" list is empty still produce a
    # well-formed (empty) summary instead of hitting an unbound variable.
    summary_columns = SLEEP_SUMMARY_COLUMNS_V1
    # Parse JSON into individual records
    for multi_record in sleep_data.fitbit_data:
        for record in json.loads(multi_record)["sleep"]:
            # Whether the sleep episode is nap (0) or main sleep (1)
            d_is_main_sleep = 1 if record["isMainSleep"] else 0

            # Only sleep API version 1 records carry "awakeCount"
            if "awakeCount" not in record:
                raise ValueError("Sleep data for API v1.2 is not supported yet.")
            records_summary, records_intraday = parseOneRecordForV1(record, device_id, d_is_main_sleep, records_summary, records_intraday, HOUR2EPOCH)

    return pd.DataFrame(data=records_summary, columns=summary_columns), pd.DataFrame(data=records_intraday, columns=SLEEP_INTRADAY_COLUMNS)
STEPS_INTRADAY_COLUMNS = ("device_id",
                          "steps",
                          "local_date_time", "local_date", "local_month", "local_day",
                          "local_day_of_week", "local_time", "local_hour", "local_minute",
                          "local_day_segment")


def parseStepsData(steps_data, HOUR2EPOCH):
    """Flatten raw Fitbit steps JSON rows into one row per intraday sample.

    Parameters:
        steps_data: DataFrame with a "device_id" column and a "fitbit_data"
            column holding one JSON document (as text) per row.
        HOUR2EPOCH: sequence indexed by hour (0-23) giving the day segment label.

    Returns:
        (summary, intraday): the summary frame is always empty for steps;
        the intraday frame has the STEPS_INTRADAY_COLUMNS columns, also
        when the input is empty.
    """
    if steps_data.empty:
        # Use the constant defined in this module; the previous name
        # (STEPS_COLUMNS) no longer exists and raised NameError here.
        return pd.DataFrame(), pd.DataFrame(columns=STEPS_INTRADAY_COLUMNS)
    device_id = steps_data["device_id"].iloc[0]
    records_intraday = []
    # Parse JSON into individual records
    for record in steps_data.fitbit_data:
        record = json.loads(record)  # Parse text into JSON
        curr_date = datetime.strptime(
            record["activities-steps"][0]["dateTime"], "%Y-%m-%d")
        dataset = record["activities-steps-intraday"]["dataset"]
        for data in dataset:
            # Samples only carry a clock time; attach the day of the record.
            d_time = datetime.strptime(data["time"], '%H:%M:%S').time()
            d_datetime = datetime.combine(curr_date, d_time)

            row_intraday = (device_id,
                            data["value"],
                            d_datetime, d_datetime.date(), d_datetime.month, d_datetime.day,
                            d_datetime.weekday(), d_datetime.time(), d_datetime.hour, d_datetime.minute,
                            HOUR2EPOCH[d_datetime.hour])

            records_intraday.append(row_intraday)

    return pd.DataFrame(), pd.DataFrame(data=records_intraday, columns=STEPS_INTRADAY_COLUMNS)
-CALORIES_COLUMNS = ("device_id", - "level", "mets", "value", - "local_date_time", "local_date", "local_month", "local_day", - "local_day_of_week", "local_time", "local_hour", "local_minute", - "local_day_segment") - def drop_duplicates(data, local_timezone): """ Data is pulled in intraday manner. Since data will be duplicated until the @@ -47,160 +27,6 @@ def drop_duplicates(data, local_timezone): return data -def parse_steps_data(steps_data): - if steps_data.empty: - return pd.DataFrame(columns=STEPS_COLUMNS) - device_id = steps_data["device_id"].iloc[0] - records = [] - # Parse JSON into individual records - for record in steps_data.fitbit_data: - record = json.loads(record) # Parse text into JSON - curr_date = datetime.strptime( - record["activities-steps"][0]["dateTime"], "%Y-%m-%d") - dataset = record["activities-steps-intraday"]["dataset"] - for data in dataset: - d_time = datetime.strptime(data["time"], '%H:%M:%S').time() - d_datetime = datetime.combine(curr_date, d_time) - - row = (device_id, - data["value"], - d_datetime, - d_datetime.date(), - d_datetime.month, - d_datetime.day, - d_datetime.weekday(), - d_datetime.time(), - d_datetime.hour, - d_datetime.minute, - HOUR2EPOCH[d_datetime.hour]) - - records.append(row) - - return pd.DataFrame(data=records, columns=STEPS_COLUMNS) - -def parse_sleep_data(sleep_data): - if sleep_data.empty: - return pd.DataFrame(columns=SLEEP_COLUMNS) - device_id = sleep_data["device_id"].iloc[0] - records = [] - # Parse JSON into individual records - for multi_record in sleep_data.fitbit_data: - for record in json.loads(multi_record)["sleep"]: - - # Compute date when sleep episodes span two days - start_date = datetime.strptime(record["startTime"][:10], "%Y-%m-%d") - end_date = datetime.strptime(record["endTime"][:10], "%Y-%m-%d") - flag = 1 if start_date == end_date else 0 - for data in record["minuteData"]: - d_time = datetime.strptime(data["dateTime"], '%H:%M:%S').time() - if not flag and not d_time.hour: - flag = 1 - 
curr_date = end_date if flag else start_date - d_datetime = datetime.combine(curr_date, d_time) - - row = (device_id, - data["value"], - d_datetime, - d_datetime.date(), - d_datetime.month, - d_datetime.day, - d_datetime.weekday(), - d_datetime.time(), - d_datetime.hour, - d_datetime.minute, - HOUR2EPOCH[d_datetime.hour]) - - records.append(row) - - return pd.DataFrame(data=records, columns=SLEEP_COLUMNS) - -def parse_heartrate_data(heartrate_data): - if heartrate_data.empty: - return pd.DataFrame(columns=HR_COLUMNS) - device_id = heartrate_data["device_id"].iloc[0] - records = [] - - # Get the range of heartrate zones: outofrange, fatburn, cardio, peak - # refer to: https://help.fitbit.com/articles/en_US/Help_article/1565 - - heartrate_fitbit_data = json.loads(heartrate_data["fitbit_data"].iloc[0])["activities-heart"][0] - if "heartRateZones" in heartrate_fitbit_data: - heartrate_zones = heartrate_fitbit_data["heartRateZones"] - elif "value" in heartrate_fitbit_data: - heartrate_zones = heartrate_fitbit_data["value"]["heartRateZones"] - else: - raise ValueError("Please check the format of fitbit heartrate raw data.") - - heartrate_zones_range = {} - for hrzone in heartrate_zones: - heartrate_zones_range[hrzone["name"].lower().replace(" ", "")] = [hrzone["min"], hrzone["max"]] - - # Parse JSON into individual records - for record in heartrate_data.fitbit_data: - record = json.loads(record) # Parse text into JSON - curr_date = datetime.strptime(record["activities-heart"][0]["dateTime"], "%Y-%m-%d") - dataset = record["activities-heart-intraday"]["dataset"] - for data in dataset: - d_time = datetime.strptime(data["time"], '%H:%M:%S').time() - d_datetime = datetime.combine(curr_date, d_time) - d_hr = data["value"] - - # Get heartrate zone by range: min <= heartrate < max - d_hrzone = None - for hrzone, hrrange in heartrate_zones_range.items(): - if d_hr >= hrrange[0] and d_hr < hrrange[1]: - d_hrzone = hrzone - break - - row = (device_id, - d_hr, - d_hrzone, - 
d_datetime, - d_datetime.date(), - d_datetime.month, - d_datetime.day, - d_datetime.weekday(), - d_datetime.time(), - d_datetime.hour, - d_datetime.minute, - HOUR2EPOCH[d_datetime.hour]) - - records.append(row) - - return pd.DataFrame(data=records, columns=HR_COLUMNS) - -def parse_calories_data(calories_data): - if calories_data.empty: - return pd.DataFrame(columns=CALORIES_COLUMNS) - device_id = calories_data["device_id"].iloc[0] - records = [] - # Parse JSON into individual records - for record in calories_data.fitbit_data: - record = json.loads(record) # Parse text into JSON - curr_date = datetime.strptime( - record["activities-calories"][0]["dateTime"], "%Y-%m-%d") - dataset = record["activities-calories-intraday"]["dataset"] - for data in dataset: - d_time = datetime.strptime(data["time"], '%H:%M:%S').time() - d_datetime = datetime.combine(curr_date, d_time) - - row = (device_id, - data["level"], - data["mets"], - data["value"], - d_datetime, - d_datetime.date(), - d_datetime.month, - d_datetime.day, - d_datetime.weekday(), - d_datetime.time(), - d_datetime.hour, - d_datetime.minute, - HOUR2EPOCH[d_datetime.hour]) - - records.append(row) - - return pd.DataFrame(data=records, columns=CALORIES_COLUMNS) fitbit_data = pd.read_csv(snakemake.input[0]) @@ -211,14 +37,16 @@ data = fitbit_data[fitbit_data["fitbit_data_type"] == sensor] data = drop_duplicates(data, local_timezone) if sensor == "heartrate": - data_preprocesed = parse_heartrate_data(data) + summary_data, intraday_data = parseHeartrateData(data, HOUR2EPOCH) elif sensor == "sleep": - data_preprocesed = parse_sleep_data(data) + summary_data, intraday_data = parseSleepData(data, HOUR2EPOCH) elif sensor == "steps": - data_preprocesed = parse_steps_data(data) + summary_data, intraday_data = parseStepsData(data, HOUR2EPOCH) elif sensor == "calories": - data_preprocesed = parse_calories_data(data) + summary_data, intraday_data = parseCaloriesData(data, HOUR2EPOCH) else: raise ValueError("Please check the 
def extractHRFeaturesFromSummaryData(heartrate_summary_data, daily_features_from_summary_data):
    """Select the requested daily features from the heartrate summary data.

    Parameters:
        heartrate_summary_data: DataFrame indexed by "local_date" holding the
            "heartrate_daily_*" columns.
        daily_features_from_summary_data: names to keep, any of "restinghr",
            "caloriesoutofrange", "caloriesfatburn", "caloriescardio",
            "caloriespeak"; other names are ignored.

    Returns:
        DataFrame with the index moved into a column followed by the selected
        feature columns, always in the order listed above.
    """
    # calories features might be inaccurate: they depend on users' fitbit profile (weight, height, etc.)
    known_features = ("restinghr", "caloriesoutofrange", "caloriesfatburn", "caloriescardio", "caloriespeak")
    selected_columns = ["heartrate_daily_" + name for name in known_features
                        if name in daily_features_from_summary_data]
    # Selecting from the source frame keeps its index even when nothing is
    # selected, so the date column survives for the later merge.
    return heartrate_summary_data[selected_columns].reset_index()


def extractHRFeaturesFromIntradayData(heartrate_intraday_data, features, day_segment=None):
    """Compute per-day heartrate features from intraday samples.

    Parameters:
        heartrate_intraday_data: DataFrame with "device_id", "heartrate",
            "heartrate_zone", "local_date", "local_hour", "local_minute" and
            "local_day_segment" columns.
        features: names of the features to compute.
        day_segment: "daily" or a day segment label used to filter rows. When
            omitted, the module-level ``day_segment`` is used, which is what
            this function originally relied on.

    Returns:
        DataFrame with a "local_date" column plus one
        "heartrate_<day_segment>_<feature>" column per computed feature. When
        there is no data (before or after filtering) the frame is empty but
        has a column for every requested feature.
    """
    if day_segment is None:
        # Backward compatible fallback for callers that set the script global.
        day_segment = globals()["day_segment"]
    prefix = "heartrate_" + day_segment + "_"
    no_features = pd.DataFrame(columns=["local_date"] + [prefix + x for x in features])
    if heartrate_intraday_data.empty:
        return no_features

    # Measured on the unfiltered data: used to turn row counts into minutes.
    num_rows_per_minute = heartrate_intraday_data.groupby(["local_date", "local_hour", "local_minute"]).count().mean()["device_id"]
    if day_segment != "daily":
        heartrate_intraday_data = heartrate_intraday_data[heartrate_intraday_data["local_day_segment"] == day_segment]
    if heartrate_intraday_data.empty:
        return no_features

    hr_by_date = heartrate_intraday_data.groupby(["local_date"])["heartrate"]
    # Fix the row index to every date with data so the result does not depend
    # on which feature happens to be assigned first.
    heartrate_intraday_features = pd.DataFrame(index=hr_by_date.size().index)

    # The mode feeds three features; compute it once.
    if any(name in features for name in ("modehr", "diffmaxmodehr", "diffminmodehr")):
        mode_hr = hr_by_date.agg(lambda x: pd.Series.mode(x)[0])

    # get stats of heartrate
    if "maxhr" in features:
        heartrate_intraday_features[prefix + "maxhr"] = hr_by_date.max()
    if "minhr" in features:
        heartrate_intraday_features[prefix + "minhr"] = hr_by_date.min()
    if "avghr" in features:
        heartrate_intraday_features[prefix + "avghr"] = hr_by_date.mean()
    if "medianhr" in features:
        heartrate_intraday_features[prefix + "medianhr"] = hr_by_date.median()
    if "modehr" in features:
        heartrate_intraday_features[prefix + "modehr"] = mode_hr
    if "stdhr" in features:
        heartrate_intraday_features[prefix + "stdhr"] = hr_by_date.std()
    if "diffmaxmodehr" in features:
        heartrate_intraday_features[prefix + "diffmaxmodehr"] = hr_by_date.max() - mode_hr
    if "diffminmodehr" in features:
        heartrate_intraday_features[prefix + "diffminmodehr"] = mode_hr - hr_by_date.min()
    if "entropyhr" in features:
        heartrate_intraday_features[prefix + "entropyhr"] = hr_by_date.agg(entropy)

    # get number of minutes in each heart rate zone
    # (fixed iteration order keeps the output columns deterministic)
    for feature_name in ("lengthoutofrange", "lengthfatburn", "lengthcardio", "lengthpeak"):
        if feature_name not in features:
            continue
        zone_name = feature_name[len("length"):]
        heartrate_zone = heartrate_intraday_data[heartrate_intraday_data["heartrate_zone"] == zone_name]
        column = prefix + feature_name
        heartrate_intraday_features[column] = heartrate_zone.groupby(["local_date"])["device_id"].count() / num_rows_per_minute
        # Days without samples in this zone spent zero minutes in it.
        heartrate_intraday_features[column] = heartrate_intraday_features[column].fillna(0)

    return heartrate_intraday_features.reset_index()
heartrate_data.groupby(["local_date"])["heartrate"].agg(lambda x: pd.Series.mode(x)[0]) - if "stdhr" in features: - heartrate_features["heartrate_" + day_segment + "_stdhr"] = heartrate_data.groupby(["local_date"])["heartrate"].std() - if "diffmaxmodehr" in features: - heartrate_features["heartrate_" + day_segment + "_diffmaxmodehr"] = heartrate_data.groupby(["local_date"])["heartrate"].max() - heartrate_data.groupby(["local_date"])["heartrate"].agg(lambda x: pd.Series.mode(x)[0]) - if "diffminmodehr" in features: - heartrate_features["heartrate_" + day_segment + "_diffminmodehr"] = heartrate_data.groupby(["local_date"])["heartrate"].agg(lambda x: pd.Series.mode(x)[0]) - heartrate_data.groupby(["local_date"])["heartrate"].min() - if "entropyhr" in features: - heartrate_features["heartrate_" + day_segment + "_entropyhr"] = heartrate_data.groupby(["local_date"])["heartrate"].agg(entropy) - - # get number of minutes in each heart rate zone - for feature_name in list(set(["lengthoutofrange", "lengthfatburn", "lengthcardio", "lengthpeak"]) & set(features)): - heartrate_zone = heartrate_data[heartrate_data["heartrate_zone"] == feature_name[6:]] - heartrate_features["heartrate_" + day_segment + "_" + feature_name] = heartrate_zone.groupby(["local_date"])["device_id"].count() / num_rows_per_minute - heartrate_features.fillna(value={"heartrate_" + day_segment + "_" + feature_name: 0}, inplace=True) - - heartrate_features = heartrate_features.reset_index() +heartrate_intraday_features = extractHRFeaturesFromIntradayData(heartrate_intraday_data, features) +if not heartrate_summary_data.empty and day_segment == "daily" and daily_features_from_summary_data != []: + heartrate_summary_features = extractHRFeaturesFromSummaryData(heartrate_summary_data, daily_features_from_summary_data) + heartrate_features = heartrate_intraday_features.merge(heartrate_summary_features, on=["local_date"], how="outer") +else: + heartrate_features = heartrate_intraday_features 
heartrate_features.to_csv(snakemake.output[0], index=False)
diff --git a/src/features/fitbit_sleep_features.py b/src/features/fitbit_sleep_features.py
new file mode 100644
index 00000000..9f4c6c7f
--- /dev/null
+++ b/src/features/fitbit_sleep_features.py
@@ -0,0 +1,89 @@
+import pandas as pd
+import itertools
+
+
+def dailyFeaturesFromSummaryData(sleep_summary_data, sleep_type):
+    """Add the daily summary features of one sleep type to the feature table.
+
+    Rows are grouped by local_end_date, which becomes the local_date index.
+    Reads the module-level daily_features_from_summary_data list and writes
+    into the module-level sleep_daily_features DataFrame; both must exist
+    before the first call.
+
+    sleep_summary_data: DataFrame of sleep records (presumably one row per
+        sleep episode).
+    sleep_type: "main" (is_main_sleep == 1), "nap" (is_main_sleep == 0) or
+        "all" (no filtering).
+
+    Returns the updated sleep_daily_features. Raises ValueError for any other
+    sleep_type.
+    """
+    if sleep_type == "main":
+        sleep_summary_data = sleep_summary_data[sleep_summary_data["is_main_sleep"] == 1]
+    elif sleep_type == "nap":
+        sleep_summary_data = sleep_summary_data[sleep_summary_data["is_main_sleep"] == 0]
+    elif sleep_type == "all":
+        pass
+    else:
+        raise ValueError("sleep_type can only be one of ['main', 'nap', 'all'].")
+
+    features_sum = sleep_summary_data[["minutes_after_wakeup", "minutes_asleep", "minutes_awake", "minutes_to_fall_asleep", "minutes_in_bed", "local_end_date"]].groupby(["local_end_date"]).sum()
+    features_sum.index.rename("local_date", inplace=True)
+    if "sumdurationafterwakeup" in daily_features_from_summary_data:
+        sleep_daily_features["sleep_daily_sumdurationafterwakeup" + sleep_type] = features_sum["minutes_after_wakeup"]
+    if "sumdurationasleep" in daily_features_from_summary_data:
+        sleep_daily_features["sleep_daily_sumdurationasleep" + sleep_type] = features_sum["minutes_asleep"]
+    if "sumdurationawake" in daily_features_from_summary_data:
+        sleep_daily_features["sleep_daily_sumdurationawake" + sleep_type] = features_sum["minutes_awake"]
+    if "sumdurationtofallasleep" in daily_features_from_summary_data:
+        sleep_daily_features["sleep_daily_sumdurationtofallasleep" + sleep_type] = features_sum["minutes_to_fall_asleep"]
+    if "sumdurationinbed" in daily_features_from_summary_data:
+        sleep_daily_features["sleep_daily_sumdurationinbed" + sleep_type] = features_sum["minutes_in_bed"]
+
+    features_avg = sleep_summary_data[["efficiency", "local_end_date"]].groupby(["local_end_date"]).mean()
+    features_avg.index.rename("local_date", inplace=True)
+    if "avgefficiency" in daily_features_from_summary_data:
+        sleep_daily_features["sleep_daily_avgefficiency" + sleep_type] = features_avg["efficiency"]
+
+    features_count = sleep_summary_data[["local_start_date_time", "local_end_date"]].groupby(["local_end_date"]).count()
+    features_count.index.rename("local_date", inplace=True)
+    if "countepisode" in daily_features_from_summary_data:
+        # The column name must use the configured feature name ("countepisode")
+        # so that it matches the names the script builds in colnames.
+        sleep_daily_features["sleep_daily_countepisode" + sleep_type] = features_count["local_start_date_time"]
+
+    return sleep_daily_features
+
+
+sleep_summary_data = pd.read_csv(snakemake.input["sleep_summary_data"])
+sleep_types = snakemake.params["sleep_types"]
+daily_features_from_summary_data = snakemake.params["daily_features_from_summary_data"]
+day_segment = snakemake.params["day_segment"]
+
+# avgefficiency is a mean and stays NaN on days without matching records;
+# every other feature is a sum or a count, so a missing value means 0.
+daily_features_can_be_zero = list(set(daily_features_from_summary_data) - set(["avgefficiency"]))
+colnames_can_be_zero = ["sleep_daily_" + x for x in ["".join(feature) for feature in itertools.product(daily_features_can_be_zero, sleep_types)]]
+
+colnames = ["sleep_daily_" + x for x in ["".join(feature) for feature in itertools.product(daily_features_from_summary_data, sleep_types)]]
+
+if sleep_summary_data.empty:
+    # Index the empty frame by local_date so to_csv writes the same header
+    # as in the non-empty case instead of an extra unnamed index column.
+    sleep_daily_features = pd.DataFrame(columns=colnames, index=pd.Index([], name="local_date"))
+else:
+    # Fix the index to every date present in the data up front. Otherwise the
+    # first assigned column would set the index and dates that only occur for
+    # a later sleep type (e.g. nap-only days) would be dropped on alignment.
+    all_dates = pd.Index(sorted(sleep_summary_data["local_end_date"].dropna().unique()), name="local_date")
+    sleep_daily_features = pd.DataFrame(columns=colnames, index=all_dates)
+    for sleep_type in sleep_types:
+        sleep_daily_features = dailyFeaturesFromSummaryData(sleep_summary_data, sleep_type)
+
+    sleep_daily_features[colnames_can_be_zero] = sleep_daily_features[colnames_can_be_zero].fillna(0)
+
+
+if day_segment == "daily":
+    sleep_daily_features.to_csv(snakemake.output[0])
+else:
+    raise ValueError("Sleep summary features are only implemented for daily day segments")