Add fitbit raw data and datetime

replace/9708685c870fa6a1a9e6565e58db888ee56dd66c
Meng Li 2020-01-15 17:18:10 -05:00
parent ad514b5d40
commit 34c4586e4d
6 changed files with 261 additions and 3 deletions

Snakefile

@@ -7,6 +7,7 @@ include: "rules/reports.snakefile"
rule all:
    input:
        expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["SENSORS"]),
        expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["FITBIT_TABLE"]),
        expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["SENSORS"]),
        expand("data/processed/{pid}/battery_deltas.csv", pid=config["PIDS"]),
        expand("data/processed/{pid}/screen_deltas.csv", pid=config["PIDS"]),
@@ -37,8 +38,11 @@ rule all:
            pid = config["PIDS"],
            day_segment = config["LIGHT"]["DAY_SEGMENTS"]),
        expand("data/processed/{pid}/accelerometer_{day_segment}.csv",
            pid = config["PIDS"],
            day_segment = config["ACCELEROMETER"]["DAY_SEGMENTS"]),
pid = config["PIDS"],
day_segment = config["ACCELEROMETER"]["DAY_SEGMENTS"]),
expand("data/raw/{pid}/fitbit_{fitbit_sensor}_with_datetime.csv",
pid=config["PIDS"],
fitbit_sensor=config["FITBIT_SENSORS"]),
# Reports
expand("reports/figures/{pid}/{sensor}_heatmap_rows.html", pid=config["PIDS"], sensor=config["SENSORS"]),
expand("reports/figures/{pid}/compliance_heatmap.html", pid=config["PIDS"]),

config.yaml

@@ -1,6 +1,9 @@
# Valid database table names
SENSORS: [applications_crashes, applications_foreground, applications_notifications, battery, bluetooth, calls, fitbit_data, locations, messages, plugin_ambient_noise, plugin_device_usage, plugin_google_activity_recognition, screen]
FITBIT_TABLE: [fitbit_data]
FITBIT_SENSORS: [heartrate, steps, sleep]
# Participants to include in the analysis
# You must create a file for each participant
# named pXXX containing their device_id
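As the comments say, each participant needs a pXXX file holding their device_id. A hypothetical example: a file named data/external/p01 (the directory is an assumption, not shown in this diff) containing a single line with the participant's AWARE device_id:

    3a7c9e0b-1111-2222-3333-444455556666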

rules/preprocessing.snakefile

@@ -15,6 +15,8 @@ rule readable_datetime:
    params:
        timezones = None,
        fixed_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"]
    wildcard_constraints:
        sensor = "(?!fitbit).*" # exclude fitbit sensors; they get dedicated rules below
    output:
        "data/raw/{pid}/{sensor}_with_datetime.csv"
    script:
@@ -65,4 +67,34 @@ rule resample_fused_location:
    output:
        "data/raw/{pid}/locations_resampled.csv"
    script:
        "../src/data/resample_fused_location.R"
"../src/data/resample_fused_location.R"

rule fitbit_heartrate_with_datetime:
    input:
        "data/raw/{pid}/fitbit_data_raw.csv"
    params:
        local_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"]
    output:
        "data/raw/{pid}/fitbit_heartrate_with_datetime.csv"
    script:
        "../src/data/fitbit_heartrate_with_datetime.py"

rule fitbit_steps_with_datetime:
    input:
        "data/raw/{pid}/fitbit_data_raw.csv"
    params:
        local_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"]
    output:
        "data/raw/{pid}/fitbit_steps_with_datetime.csv"
    script:
        "../src/data/fitbit_steps_with_datetime.py"

rule fitbit_sleep_with_datetime:
    input:
        "data/raw/{pid}/fitbit_data_raw.csv"
    params:
        local_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"]
    output:
        "data/raw/{pid}/fitbit_sleep_with_datetime.csv"
    script:
        "../src/data/fitbit_sleep_with_datetime.py"

src/data/fitbit_heartrate_with_datetime.py

@@ -0,0 +1,71 @@
import json
import pytz
import pandas as pd
from datetime import datetime

NIGHT = "night"
MORNING = "morning"
AFTERNOON = "afternoon"
EVENING = "evening"
# Map each hour of the day (0-23) to one of four six-hour day segments
HOUR2EPOCH = [NIGHT] * 6 + [MORNING] * 6 + [AFTERNOON] * 6 + [EVENING] * 6

HR_COLUMNS = ("device_id",
              "heartrate",
              "local_date_time",
              "local_date",
              "local_month",
              "local_day",
              "local_day_of_week",
              "local_time",
              "local_hour",
              "local_minute",
              "local_day_segment")

fitbit_data = pd.read_csv(snakemake.input[0])
heartrate_data = fitbit_data[fitbit_data["fitbit_data_type"] == "heartrate"]
local_timezone = pytz.timezone(snakemake.params["local_timezone"])

"""
Data is pulled in an intraday manner: every pull within a day repeats all of
that day's records so far, so only the last pull of each day is complete.
Derive the local date from the timezone-aware timestamp, sort by timestamp,
and keep only the last record for each local date.
"""
local_date_col = heartrate_data["timestamp"].apply(lambda ts: str(datetime.fromtimestamp(ts/1000, tz=local_timezone).date()))
heartrate_data = heartrate_data.assign(local_date=local_date_col.values)
heartrate_data.sort_values(by="timestamp", ascending=True, inplace=True)
heartrate_data.drop_duplicates(subset="local_date", keep="last", inplace=True)

device_id = heartrate_data["device_id"].iloc[0]
records = []

# Parse each day's JSON payload into one row per intraday sample
for record in heartrate_data.fitbit_data:
    record = json.loads(record)  # Parse text into JSON
    curr_date = datetime.strptime(record["activities-heart"][0]["dateTime"], "%Y-%m-%d")
    dataset = record["activities-heart-intraday"]["dataset"]
    for data in dataset:
        d_time = datetime.strptime(data["time"], '%H:%M:%S').time()
        d_datetime = datetime.combine(curr_date, d_time)
        # Create a tuple of parsed data
        row = (device_id,
               data["value"],
               d_datetime,
               d_datetime.date(),
               d_datetime.month,
               d_datetime.day,
               d_datetime.weekday(),
               d_datetime.time(),
               d_datetime.hour,
               d_datetime.minute,
               HOUR2EPOCH[d_datetime.hour])
        records.append(row)

# Create a new DataFrame from the list of tuples
heartrate_preprocessed = pd.DataFrame(data=records, columns=HR_COLUMNS)
heartrate_preprocessed.to_csv(snakemake.output[0], index=False)
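For reference, a trimmed and entirely hypothetical example of the JSON text this script expects in each fitbit_data cell, following the intraday heart rate shape assumed by the loop above:

    {"activities-heart": [{"dateTime": "2020-01-15", "value": {"restingHeartRate": 62}}],
     "activities-heart-intraday": {"dataset": [{"time": "00:00:00", "value": 68},
                                               {"time": "00:01:00", "value": 67}],
                                   "datasetInterval": 1, "datasetType": "minute"}}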

src/data/fitbit_sleep_with_datetime.py

@@ -0,0 +1,76 @@
import json
import pytz
import pandas as pd
from datetime import datetime

NIGHT = "night"
MORNING = "morning"
AFTERNOON = "afternoon"
EVENING = "evening"
# Map each hour of the day (0-23) to one of four six-hour day segments
HOUR2EPOCH = [NIGHT] * 6 + [MORNING] * 6 + [AFTERNOON] * 6 + [EVENING] * 6

SLEEP_COLUMNS = ("device_id",
                 "sleep",  # 1: "asleep", 2: "restless", or 3: "awake"
                 "local_date_time",
                 "local_date",
                 "local_month",
                 "local_day",
                 "local_day_of_week",
                 "local_time",
                 "local_hour",
                 "local_minute",
                 "local_day_segment")

fitbit_data = pd.read_csv(snakemake.input[0])
sleep_data = fitbit_data[fitbit_data["fitbit_data_type"] == "sleep"]
local_timezone = pytz.timezone(snakemake.params["local_timezone"])

"""
Data is pulled in an intraday manner: every pull within a day repeats all of
that day's records so far, so only the last pull of each day is complete.
Derive the local date from the timezone-aware timestamp, sort by timestamp,
and keep only the last record for each local date.
"""
local_date_col = sleep_data["timestamp"].apply(lambda ts: str(datetime.fromtimestamp(ts/1000, tz=local_timezone).date()))
sleep_data = sleep_data.assign(local_date=local_date_col.values)
sleep_data.sort_values(by="timestamp", ascending=True, inplace=True)
sleep_data.drop_duplicates(subset="local_date", keep="last", inplace=True)

device_id = sleep_data["device_id"].iloc[0]
records = []

# Parse each day's JSON payload into one row per minute of sleep
for multi_record in sleep_data.fitbit_data:
    for record in json.loads(multi_record)["sleep"]:
        # A sleep episode can cross midnight; minuteData carries only times,
        # so track which calendar date each minute belongs to.
        start_date = datetime.strptime(record["startTime"][:10], "%Y-%m-%d")
        end_date = datetime.strptime(record["endTime"][:10], "%Y-%m-%d")
        flag = 1 if start_date == end_date else 0
        for data in record["minuteData"]:
            d_time = datetime.strptime(data["dateTime"], '%H:%M:%S').time()
            # Once the clock wraps past midnight (hour == 0), switch to end_date
            if not flag and not d_time.hour:
                flag = 1
            curr_date = end_date if flag else start_date
            d_datetime = datetime.combine(curr_date, d_time)
            # Create a tuple of parsed data
            row = (device_id,
                   data["value"],
                   d_datetime,
                   d_datetime.date(),
                   d_datetime.month,
                   d_datetime.day,
                   d_datetime.weekday(),
                   d_datetime.time(),
                   d_datetime.hour,
                   d_datetime.minute,
                   HOUR2EPOCH[d_datetime.hour])
            records.append(row)

# Create a new DataFrame from the list of tuples
sleep_preprocessed = pd.DataFrame(data=records, columns=SLEEP_COLUMNS)
sleep_preprocessed.to_csv(snakemake.output[0], index=False)
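Again for reference, a trimmed and hypothetical example of one fitbit_data cell in the sleep shape assumed above; note the episode crossing midnight, which the flag logic resolves by switching from startTime's date to endTime's:

    {"sleep": [{"startTime": "2020-01-14T23:58:30.000",
                "endTime": "2020-01-15T07:12:30.000",
                "minuteData": [{"dateTime": "23:58:30", "value": "1"},
                               {"dateTime": "00:00:30", "value": "2"}]}]}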

src/data/fitbit_steps_with_datetime.py

@@ -0,0 +1,72 @@
import json
import pytz
import pandas as pd
from datetime import datetime

NIGHT = "night"
MORNING = "morning"
AFTERNOON = "afternoon"
EVENING = "evening"
# Map each hour of the day (0-23) to one of four six-hour day segments
HOUR2EPOCH = [NIGHT] * 6 + [MORNING] * 6 + [AFTERNOON] * 6 + [EVENING] * 6

STEPS_COLUMNS = ("device_id",
                 "steps",
                 "local_date_time",
                 "local_date",
                 "local_month",
                 "local_day",
                 "local_day_of_week",
                 "local_time",
                 "local_hour",
                 "local_minute",
                 "local_day_segment")

fitbit_data = pd.read_csv(snakemake.input[0])
steps_data = fitbit_data[fitbit_data["fitbit_data_type"] == "steps"]
local_timezone = pytz.timezone(snakemake.params["local_timezone"])

"""
Data is pulled in an intraday manner: every pull within a day repeats all of
that day's records so far, so only the last pull of each day is complete.
Derive the local date from the timezone-aware timestamp, sort by timestamp,
and keep only the last record for each local date.
"""
local_date_col = steps_data["timestamp"].apply(lambda ts: str(datetime.fromtimestamp(ts/1000, tz=local_timezone).date()))
steps_data = steps_data.assign(local_date=local_date_col.values)
steps_data.sort_values(by="timestamp", ascending=True, inplace=True)
steps_data.drop_duplicates(subset="local_date", keep="last", inplace=True)

device_id = steps_data["device_id"].iloc[0]
records = []

# Parse each day's JSON payload into one row per intraday sample
for record in steps_data.fitbit_data:
    record = json.loads(record)  # Parse text into JSON
    curr_date = datetime.strptime(record["activities-steps"][0]["dateTime"], "%Y-%m-%d")
    dataset = record["activities-steps-intraday"]["dataset"]
    for data in dataset:
        d_time = datetime.strptime(data["time"], '%H:%M:%S').time()
        d_datetime = datetime.combine(curr_date, d_time)
        # Create a tuple of parsed data
        row = (device_id,
               data["value"],
               d_datetime,
               d_datetime.date(),
               d_datetime.month,
               d_datetime.day,
               d_datetime.weekday(),
               d_datetime.time(),
               d_datetime.hour,
               d_datetime.minute,
               HOUR2EPOCH[d_datetime.hour])
        records.append(row)

# Create a new DataFrame from the list of tuples
steps_preprocessed = pd.DataFrame(data=records, columns=STEPS_COLUMNS)
steps_preprocessed.to_csv(snakemake.output[0], index=False)
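The steps payload mirrors the heart rate one ("activities-steps" / "activities-steps-intraday"). One detail shared by all three scripts that merits a worked example is the day-segment lookup: HOUR2EPOCH maps the 24 hours of the day to four six-hour epochs, so for instance:

    # 0-5 -> night, 6-11 -> morning, 12-17 -> afternoon, 18-23 -> evening
    assert HOUR2EPOCH[3] == "night" and HOUR2EPOCH[13] == "afternoon"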