Add fitbit raw data and datetime

2020-01-15 17:18:10 -05:00 · 2020-01-15 17:18:10 -05:00 · 34c4586e4d
parent ad514b5d40
commit 34c4586e4d
6 changed files with 261 additions and 3 deletions
--- a/8
+++ b/8
@ -7,6 +7,7 @@ include: "rules/reports.snakefile"
 rule all:
    input:
        expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["SENSORS"]),
+        expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["FITBIT_TABLE"]),  # NOTE(review): fitbit_data is also listed in SENSORS, so this repeats a target of the previous line - confirm it is needed
        expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["SENSORS"]),
        expand("data/processed/{pid}/battery_deltas.csv", pid=config["PIDS"]),
        expand("data/processed/{pid}/screen_deltas.csv", pid=config["PIDS"]),
@ -37,8 +38,11 @@ rule all:
                            pid = config["PIDS"],
                            day_segment = config["LIGHT"]["DAY_SEGMENTS"]),
        expand("data/processed/{pid}/accelerometer_{day_segment}.csv",
-                    pid = config["PIDS"],
-                    day_segment = config["ACCELEROMETER"]["DAY_SEGMENTS"]),
+                            pid = config["PIDS"],
+                            day_segment = config["ACCELEROMETER"]["DAY_SEGMENTS"]),
+        expand("data/raw/{pid}/fitbit_{fitbit_sensor}_with_datetime.csv",
+                            pid=config["PIDS"],
+                            fitbit_sensor=config["FITBIT_SENSORS"]),
        # Reports
        expand("reports/figures/{pid}/{sensor}_heatmap_rows.html", pid=config["PIDS"], sensor=config["SENSORS"]),
        expand("reports/figures/{pid}/compliance_heatmap.html", pid=config["PIDS"]),
--- a/config.yaml
+++ b/config.yaml
@ -1,6 +1,9 @@
 # Valid database table names
 SENSORS: [applications_crashes, applications_foreground, applications_notifications, battery, bluetooth, calls, fitbit_data, locations, messages, plugin_ambient_noise, plugin_device_usage, plugin_google_activity_recognition, screen]

+FITBIT_TABLE: [fitbit_data]  # expanded by rule all into data/raw/{pid}/{sensor}_raw.csv targets
+FITBIT_SENSORS: [heartrate, steps, sleep]  # rule all requests one fitbit_{fitbit_sensor}_with_datetime.csv per entry
+
 # Participants to include in the analysis
 # You must create a file for each participant
 # named pXXX containing their device_id
--- a/rules/preprocessing.snakefile
+++ b/rules/preprocessing.snakefile
@ -15,6 +15,8 @@ rule readable_datetime:
    params:
        timezones = None,
        fixed_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"]
+    wildcard_constraints:
+        sensor = "^fitbit.*"  # meant to ignore fitbit sensors; NOTE(review): as written the pattern matches names that START with "fitbit" rather than excluding them - confirm (negative lookahead intended?)
    output:
        "data/raw/{pid}/{sensor}_with_datetime.csv"
    script:
@ -66,3 +68,33 @@ rule resample_fused_location:
        "data/raw/{pid}/locations_resampled.csv"
    script:
        "../src/data/resample_fused_location.R"
+
+rule fitbit_heartrate_with_datetime:  # builds the per-participant heartrate CSV from the raw Fitbit CSV
+    input:
+        "data/raw/{pid}/fitbit_data_raw.csv"  # read by the script as snakemake.input[0]
+    params:
+        local_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"],  # timezone name the script hands to pytz.timezone
+    output:
+        "data/raw/{pid}/fitbit_heartrate_with_datetime.csv"  # written by the script as snakemake.output[0]
+    script:
+        "../src/data/fitbit_heartrate_with_datetime.py"
+
+rule fitbit_steps_with_datetime:  # builds the per-participant steps CSV from the raw Fitbit CSV
+    input:
+        "data/raw/{pid}/fitbit_data_raw.csv"  # read by the script as snakemake.input[0]
+    params:
+        local_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"]  # timezone name the script hands to pytz.timezone
+    output:
+        "data/raw/{pid}/fitbit_steps_with_datetime.csv"  # written by the script as snakemake.output[0]
+    script:
+        "../src/data/fitbit_steps_with_datetime.py"
+
+rule fitbit_sleep_with_datetime:  # builds the per-participant sleep CSV from the raw Fitbit CSV
+    input:
+        "data/raw/{pid}/fitbit_data_raw.csv"  # read by the script as snakemake.input[0]
+    params:
+        local_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"]  # timezone name the script hands to pytz.timezone
+    output:
+        "data/raw/{pid}/fitbit_sleep_with_datetime.csv"  # written by the script as snakemake.output[0]
+    script:
+        "../src/data/fitbit_sleep_with_datetime.py"
--- a/src/data/fitbit_heartrate_with_datetime.py
+++ b/src/data/fitbit_heartrate_with_datetime.py
@ -0,0 +1,71 @@
+import pandas as pd
+import pytz, json
+from datetime import datetime
+# Snakemake script: expands the "heartrate" rows of the raw Fitbit CSV into one output row per intraday sample.
+# Day-segment labels; HOUR2EPOCH is indexed by hour of day (0-23), six consecutive hours per label.
+NIGHT = "night"
+MORNING = "morning"
+AFTERNOON = "afternoon"
+EVENING = "evening"
+HOUR2EPOCH = [NIGHT] * 6 + [MORNING] * 6 + [AFTERNOON] * 6 + [EVENING] * 6
+
+# Column order of the output CSV; it has to line up with the tuple built for every sample below.
+HR_COLUMNS = ("device_id",
+              "heartrate",
+              "local_date_time",
+              "local_date",
+              "local_month",
+              "local_day",
+              "local_day_of_week",
+              "local_time",
+              "local_hour",
+              "local_minute",
+              "local_day_segment")
+# `snakemake` is never imported: the object is provided by Snakemake when the rule's script directive runs this file.
+fitbit_data = pd.read_csv(snakemake.input[0])
+heartrate_data = fitbit_data[fitbit_data["fitbit_data_type"] == "heartrate"]
+# Timezone name passed in by the rule; in this script it only decides which local date a pull timestamp falls on.
+local_timezone = pytz.timezone(snakemake.params["local_timezone"])
+
+# NOTE(review): timestamp is divided by 1000 below, so it is presumably epoch milliseconds - confirm.
+"""
+Data is pulled in intraday manner. Since data will be duplicated until the
+last record from that day, first sort by time, then drop all but
+the last record for each day. Drop duplicates based on aware timestamp.
+"""
+local_date_col = heartrate_data["timestamp"].apply(lambda ts: str(datetime.fromtimestamp(ts/1000, tz=local_timezone).date()))
+heartrate_data = heartrate_data.assign(local_date=local_date_col.values)
+heartrate_data.sort_values(by="timestamp", ascending=True, inplace=True)
+heartrate_data.drop_duplicates(subset="local_date", keep="last", inplace=True)
+# NOTE(review): .iloc[0] raises IndexError when the input holds no "heartrate" rows - confirm upstream guarantees data.
+device_id = heartrate_data["device_id"].iloc[0]
+records = []
+# Parse JSON into individual records
+for record in heartrate_data.fitbit_data:
+    record = json.loads(record)  # Parse text into JSON
+    curr_date = datetime.strptime(record["activities-heart"][0]["dateTime"], "%Y-%m-%d")
+    dataset = record["activities-heart-intraday"]["dataset"]
+    for data in dataset:
+        d_time = datetime.strptime(data["time"], '%H:%M:%S').time()
+        d_datetime = datetime.combine(curr_date, d_time)
+        # d_datetime is naive: its date and time come straight from the JSON payload, local_timezone is not applied.
+        # Create tuple of parsed data
+        row = (device_id,
+               data["value"],
+               d_datetime,
+               d_datetime.date(),
+               d_datetime.month,
+               d_datetime.day,
+               d_datetime.weekday(),  # Monday is 0, Sunday is 6
+               d_datetime.time(),
+               d_datetime.hour,
+               d_datetime.minute,
+               HOUR2EPOCH[d_datetime.hour])
+
+        # Append the data to a list
+        records.append(row)
+
+# Create a new DataFrame from the list of tuples.
+heartrate_preprocessed = pd.DataFrame(data=records, columns=HR_COLUMNS)
+
+heartrate_preprocessed.to_csv(snakemake.output[0], index=False)
--- a/src/data/fitbit_sleep_with_datetime.py
+++ b/src/data/fitbit_sleep_with_datetime.py
@ -0,0 +1,76 @@
+import pandas as pd
+import pytz, json
+from datetime import datetime
+
+# Snakemake script: expands the "sleep" rows of the raw Fitbit CSV into one output row per minuteData sample.
+# Day-segment labels; HOUR2EPOCH is indexed by hour of day (0-23), six consecutive hours per label.
+NIGHT = "night"
+MORNING = "morning"
+AFTERNOON = "afternoon"
+EVENING = "evening"
+HOUR2EPOCH = [NIGHT] * 6 + [MORNING] * 6 + [AFTERNOON] * 6 + [EVENING] * 6
+
+# Column order of the output CSV; it has to line up with the tuple built for every sample below.
+SLEEP_COLUMNS = ("device_id",
+                 "sleep", # 1: "asleep", 2: "restless", or 3: "awake"
+                 "local_date_time",
+                 "local_date",
+                 "local_month",
+                 "local_day",
+                 "local_day_of_week",
+                 "local_time",
+                 "local_hour",
+                 "local_minute",
+                 "local_day_segment")
+# `snakemake` is never imported: the object is provided by Snakemake when the rule's script directive runs this file.
+fitbit_data = pd.read_csv(snakemake.input[0])
+sleep_data = fitbit_data[fitbit_data["fitbit_data_type"] == "sleep"]
+# Timezone name passed in by the rule; in this script it only decides which local date a pull timestamp falls on.
+local_timezone = pytz.timezone(snakemake.params["local_timezone"])
+
+# NOTE(review): timestamp is divided by 1000 below, so it is presumably epoch milliseconds - confirm.
+"""
+Data is pulled in intraday manner. Since data will be duplicated until the
+last record from that day, first sort by time, then drop all but
+the last record for each day. Drop duplicates based on aware timestamp.
+"""
+local_date_col = sleep_data["timestamp"].apply(lambda ts: str(datetime.fromtimestamp(ts/1000, tz=local_timezone).date()))
+sleep_data = sleep_data.assign(local_date=local_date_col.values)
+sleep_data.sort_values(by="timestamp", ascending=True, inplace=True)
+sleep_data.drop_duplicates(subset="local_date", keep="last", inplace=True)
+# NOTE(review): .iloc[0] raises IndexError when the input holds no "sleep" rows - confirm upstream guarantees data.
+device_id = sleep_data["device_id"].iloc[0]
+records = []
+# Parse JSON into individual records
+for multi_record in sleep_data.fitbit_data:
+    for record in json.loads(multi_record)["sleep"]:
+        start_date = datetime.strptime(record["startTime"][:10], "%Y-%m-%d")  # first 10 characters hold the YYYY-MM-DD part
+        end_date = datetime.strptime(record["endTime"][:10], "%Y-%m-%d")
+        flag = 1 if start_date == end_date else 0  # 1 once samples are dated with end_date, 0 while still on start_date
+        for data in record["minuteData"]:
+            d_time = datetime.strptime(data["dateTime"], '%H:%M:%S').time()
+            if not flag and not d_time.hour:  # first sample in hour 0 switches dating from start_date to end_date
+                flag = 1
+            curr_date = end_date if flag else start_date
+            d_datetime = datetime.combine(curr_date, d_time)
+            # NOTE(review): assumes minuteData is chronological and crosses midnight at most once; d_datetime is naive - confirm.
+            # Create tuple of parsed data
+            row = (device_id,
+                   data["value"],
+                   d_datetime,
+                   d_datetime.date(),
+                   d_datetime.month,
+                   d_datetime.day,
+                   d_datetime.weekday(),  # Monday is 0, Sunday is 6
+                   d_datetime.time(),
+                   d_datetime.hour,
+                   d_datetime.minute,
+                   HOUR2EPOCH[d_datetime.hour])
+
+            # Append the data to a list
+            records.append(row)
+
+# Create a new DataFrame from the list of tuples.
+sleep_preprocessed = pd.DataFrame(data=records, columns=SLEEP_COLUMNS)
+
+sleep_preprocessed.to_csv(snakemake.output[0], index=False)
--- a/src/data/fitbit_steps_with_datetime.py
+++ b/src/data/fitbit_steps_with_datetime.py
@ -0,0 +1,72 @@
+import pandas as pd
+import pytz, json
+from datetime import datetime
+# Snakemake script: expands the "steps" rows of the raw Fitbit CSV into one output row per intraday sample.
+# Day-segment labels; HOUR2EPOCH is indexed by hour of day (0-23), six consecutive hours per label.
+NIGHT = "night"
+MORNING = "morning"
+AFTERNOON = "afternoon"
+EVENING = "evening"
+HOUR2EPOCH = [NIGHT] * 6 + [MORNING] * 6 + [AFTERNOON] * 6 + [EVENING] * 6
+
+# Column order of the output CSV; it has to line up with the tuple built for every sample below.
+STEPS_COLUMNS = ("device_id",
+                 "steps",
+                 "local_date_time",
+                 "local_date",
+                 "local_month",
+                 "local_day",
+                 "local_day_of_week",
+                 "local_time",
+                 "local_hour",
+                 "local_minute",
+                 "local_day_segment")
+# `snakemake` is never imported: the object is provided by Snakemake when the rule's script directive runs this file.
+fitbit_data = pd.read_csv(snakemake.input[0])
+steps_data = fitbit_data[fitbit_data["fitbit_data_type"] == "steps"]
+# Timezone name passed in by the rule; in this script it only decides which local date a pull timestamp falls on.
+local_timezone = pytz.timezone(snakemake.params["local_timezone"])
+
+# NOTE(review): timestamp is divided by 1000 below, so it is presumably epoch milliseconds - confirm.
+"""
+Data is pulled in intraday manner. Since data will be duplicated until the
+last record from that day, first sort by time, then drop all but
+the last record for each day. Drop duplicates based on aware timestamp.
+"""
+local_date_col = steps_data["timestamp"].apply(lambda ts: str(datetime.fromtimestamp(ts/1000, tz=local_timezone).date()))
+steps_data = steps_data.assign(local_date=local_date_col.values)
+steps_data.sort_values(by="timestamp", ascending=True, inplace=True)
+steps_data.drop_duplicates(subset="local_date", keep="last", inplace=True)
+# NOTE(review): .iloc[0] raises IndexError when the input holds no "steps" rows - confirm upstream guarantees data.
+device_id = steps_data["device_id"].iloc[0]
+records = []
+# Parse JSON into individual records
+for record in steps_data.fitbit_data:
+    record = json.loads(record)  # Parse text into JSON
+    curr_date = datetime.strptime(
+        record["activities-steps"][0]["dateTime"], "%Y-%m-%d")
+    dataset = record["activities-steps-intraday"]["dataset"]
+    for data in dataset:
+        d_time = datetime.strptime(data["time"], '%H:%M:%S').time()
+        d_datetime = datetime.combine(curr_date, d_time)
+        # d_datetime is naive: its date and time come straight from the JSON payload, local_timezone is not applied.
+        # Create tuple of parsed data
+        row = (device_id,
+               data["value"],
+               d_datetime,
+               d_datetime.date(),
+               d_datetime.month,
+               d_datetime.day,
+               d_datetime.weekday(),  # Monday is 0, Sunday is 6
+               d_datetime.time(),
+               d_datetime.hour,
+               d_datetime.minute,
+               HOUR2EPOCH[d_datetime.hour])
+
+        # Append the data to a list
+        records.append(row)
+
+# Create a new DataFrame from the list of tuples.
+steps_preprocessed = pd.DataFrame(data=records, columns=STEPS_COLUMNS)
+
+steps_preprocessed.to_csv(snakemake.output[0], index=False)