diff --git a/Snakefile b/Snakefile
index 035e52c9..a8b6f2b2 100644
--- a/Snakefile
+++ b/Snakefile
@@ -7,6 +7,7 @@ include: "rules/reports.snakefile"
 rule all:
     input:
         expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["SENSORS"]),
+        expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["FITBIT_TABLE"]),
         expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["SENSORS"]),
         expand("data/processed/{pid}/battery_deltas.csv", pid=config["PIDS"]),
         expand("data/processed/{pid}/screen_deltas.csv", pid=config["PIDS"]),
@@ -37,8 +38,11 @@ rule all:
             pid = config["PIDS"],
             day_segment = config["LIGHT"]["DAY_SEGMENTS"]),
         expand("data/processed/{pid}/accelerometer_{day_segment}.csv",
-                        pid = config["PIDS"],
-                        day_segment = config["ACCELEROMETER"]["DAY_SEGMENTS"]),
+            pid = config["PIDS"],
+            day_segment = config["ACCELEROMETER"]["DAY_SEGMENTS"]),
+        expand("data/raw/{pid}/fitbit_{fitbit_sensor}_with_datetime.csv",
+            pid=config["PIDS"],
+            fitbit_sensor=config["FITBIT_SENSORS"]),
         # Reports
         expand("reports/figures/{pid}/{sensor}_heatmap_rows.html", pid=config["PIDS"], sensor=config["SENSORS"]),
         expand("reports/figures/{pid}/compliance_heatmap.html", pid=config["PIDS"]),
diff --git a/config.yaml b/config.yaml
index 5877d60a..ebd827e2 100644
--- a/config.yaml
+++ b/config.yaml
@@ -1,6 +1,9 @@
 # Valid database table names
 SENSORS: [applications_crashes, applications_foreground, applications_notifications, battery, bluetooth, calls, fitbit_data, locations, messages, plugin_ambient_noise, plugin_device_usage, plugin_google_activity_recognition, screen]
 
+FITBIT_TABLE: [fitbit_data]
+FITBIT_SENSORS: [heartrate, steps, sleep]
+
 # Participants to include in the analysis
 # You must create a file for each participant
 # named pXXX containing their device_id
diff --git a/rules/preprocessing.snakefile b/rules/preprocessing.snakefile
index 29bc3b06..190b3bf3 100644
--- a/rules/preprocessing.snakefile
+++ b/rules/preprocessing.snakefile
@@ -15,6 +15,8 @@ rule readable_datetime:
     params:
         timezones = None,
         fixed_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"]
+    wildcard_constraints:
+        sensor = "(?!fitbit_(?:heartrate|steps|sleep)).*" # skip the fitbit sensors handled by the dedicated rules below
     output:
         "data/raw/{pid}/{sensor}_with_datetime.csv"
     script:
@@ -65,4 +67,34 @@ rule resample_fused_location:
     output:
         "data/raw/{pid}/locations_resampled.csv"
     script:
-        "../src/data/resample_fused_location.R"
\ No newline at end of file
+        "../src/data/resample_fused_location.R"
+
+rule fitbit_heartrate_with_datetime:
+    input:
+        "data/raw/{pid}/fitbit_data_raw.csv"
+    params:
+        local_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"]
+    output:
+        "data/raw/{pid}/fitbit_heartrate_with_datetime.csv"
+    script:
+        "../src/data/fitbit_heartrate_with_datetime.py"
+
+rule fitbit_steps_with_datetime:
+    input:
+        "data/raw/{pid}/fitbit_data_raw.csv"
+    params:
+        local_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"]
+    output:
+        "data/raw/{pid}/fitbit_steps_with_datetime.csv"
+    script:
+        "../src/data/fitbit_steps_with_datetime.py"
+
+rule fitbit_sleep_with_datetime:
+    input:
+        "data/raw/{pid}/fitbit_data_raw.csv"
+    params:
+        local_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"]
+    output:
+        "data/raw/{pid}/fitbit_sleep_with_datetime.csv"
+    script:
+        "../src/data/fitbit_sleep_with_datetime.py"
\ No newline at end of file
diff --git a/src/data/fitbit_heartrate_with_datetime.py b/src/data/fitbit_heartrate_with_datetime.py
new file mode 100644
index 00000000..18994d51
--- /dev/null
+++ b/src/data/fitbit_heartrate_with_datetime.py
@@ -0,0 +1,71 @@
+import pandas as pd
+import pytz, json
+from datetime import datetime
+
+
+NIGHT = "night"
+MORNING = "morning"
+AFTERNOON = "afternoon"
+EVENING = "evening"
+HOUR2EPOCH = [NIGHT] * 6 + [MORNING] * 6 + [AFTERNOON] * 6 + [EVENING] * 6
+
+
+HR_COLUMNS = ("device_id",
+              "heartrate",
+              "local_date_time",
+              "local_date",
+              "local_month",
+              "local_day",
+              "local_day_of_week",
+              "local_time",
+              "local_hour",
+              "local_minute",
+              "local_day_segment")
+
+fitbit_data = pd.read_csv(snakemake.input[0])
+heartrate_data = fitbit_data[fitbit_data["fitbit_data_type"] == "heartrate"]
+
+local_timezone = pytz.timezone(snakemake.params["local_timezone"])
+
+
+"""
+Fitbit data is pulled in an intraday manner, so rows for a given day are
+duplicated until the last pull of that day. Sort by timestamp, derive the local
+date from the timezone-aware timestamp, and keep only the last row per local date.
+"""
+local_date_col = heartrate_data["timestamp"].apply(lambda ts: str(datetime.fromtimestamp(ts/1000, tz=local_timezone).date()))
+heartrate_data = heartrate_data.assign(local_date=local_date_col.values)
+heartrate_data.sort_values(by="timestamp", ascending=True, inplace=True)
+heartrate_data.drop_duplicates(subset="local_date", keep="last", inplace=True)
+
+device_id = heartrate_data["device_id"].iloc[0]
+records = []
+# Parse JSON into individual records
+for record in heartrate_data.fitbit_data:
+    record = json.loads(record) # Parse text into JSON
+    curr_date = datetime.strptime(record["activities-heart"][0]["dateTime"], "%Y-%m-%d")
+    dataset = record["activities-heart-intraday"]["dataset"]
+    for data in dataset:
+        d_time = datetime.strptime(data["time"], '%H:%M:%S').time()
+        d_datetime = datetime.combine(curr_date, d_time)
+
+        # Create tuple of parsed data
+        row = (device_id,
+               data["value"],
+               d_datetime,
+               d_datetime.date(),
+               d_datetime.month,
+               d_datetime.day,
+               d_datetime.weekday(),
+               d_datetime.time(),
+               d_datetime.hour,
+               d_datetime.minute,
+               HOUR2EPOCH[d_datetime.hour])
+
+        # Append the data to a list
+        records.append(row)
+
+# Create a new DataFrame from the list of tuples.
+heartrate_preprocessed = pd.DataFrame(data=records, columns=HR_COLUMNS)
+
+heartrate_preprocessed.to_csv(snakemake.output[0], index=False)
diff --git a/src/data/fitbit_sleep_with_datetime.py b/src/data/fitbit_sleep_with_datetime.py
new file mode 100644
index 00000000..213f00fa
--- /dev/null
+++ b/src/data/fitbit_sleep_with_datetime.py
@@ -0,0 +1,76 @@
+import pandas as pd
+import pytz, json
+from datetime import datetime
+
+
+
+NIGHT = "night"
+MORNING = "morning"
+AFTERNOON = "afternoon"
+EVENING = "evening"
+HOUR2EPOCH = [NIGHT] * 6 + [MORNING] * 6 + [AFTERNOON] * 6 + [EVENING] * 6
+
+
+SLEEP_COLUMNS = ("device_id",
+                 "sleep", # 1: "asleep", 2: "restless", or 3: "awake"
+                 "local_date_time",
+                 "local_date",
+                 "local_month",
+                 "local_day",
+                 "local_day_of_week",
+                 "local_time",
+                 "local_hour",
+                 "local_minute",
+                 "local_day_segment")
+
+fitbit_data = pd.read_csv(snakemake.input[0])
+sleep_data = fitbit_data[fitbit_data["fitbit_data_type"] == "sleep"]
+
+local_timezone = pytz.timezone(snakemake.params["local_timezone"])
+
+
+"""
+Fitbit data is pulled in an intraday manner, so rows for a given day are
+duplicated until the last pull of that day. Sort by timestamp, derive the local
+date from the timezone-aware timestamp, and keep only the last row per local date.
+""" +local_date_col = sleep_data["timestamp"].apply(lambda ts: str(datetime.fromtimestamp(ts/1000, tz=local_timezone).date())) +sleep_data = sleep_data.assign(local_date=local_date_col.values) +sleep_data.sort_values(by="timestamp", ascending=True, inplace=True) +sleep_data.drop_duplicates(subset="local_date", keep="last", inplace=True) + +device_id = sleep_data["device_id"].iloc[0] +records = [] +# Parse JSON into individual records +for multi_record in sleep_data.fitbit_data: + for record in json.loads(multi_record)["sleep"]: + start_date = datetime.strptime(record["startTime"][:10], "%Y-%m-%d") + end_date = datetime.strptime(record["endTime"][:10], "%Y-%m-%d") + flag = 1 if start_date == end_date else 0 + for data in record["minuteData"]: + d_time = datetime.strptime(data["dateTime"], '%H:%M:%S').time() + if not flag and not d_time.hour: + flag = 1 + curr_date = end_date if flag else start_date + d_datetime = datetime.combine(curr_date, d_time) + + # Create tuple of parsed data + row = (device_id, + data["value"], + d_datetime, + d_datetime.date(), + d_datetime.month, + d_datetime.day, + d_datetime.weekday(), + d_datetime.time(), + d_datetime.hour, + d_datetime.minute, + HOUR2EPOCH[d_datetime.hour]) + + # Append the data to a list + records.append(row) + +# Create a new DataFrame from the list of tuples. +sleep_preprocessed = pd.DataFrame(data=records, columns=SLEEP_COLUMNS) + +sleep_preprocessed.to_csv(snakemake.output[0], index=False) diff --git a/src/data/fitbit_steps_with_datetime.py b/src/data/fitbit_steps_with_datetime.py new file mode 100644 index 00000000..e741c4a2 --- /dev/null +++ b/src/data/fitbit_steps_with_datetime.py @@ -0,0 +1,72 @@ +import pandas as pd +import pytz, json +from datetime import datetime + + +NIGHT = "night" +MORNING = "morning" +AFTERNOON = "afternoon" +EVENING = "evening" +HOUR2EPOCH = [NIGHT] * 6 + [MORNING] * 6 + [AFTERNOON] * 6 + [EVENING] * 6 + + +STEPS_COLUMNS = ("device_id", + "steps", + "local_date_time", + "local_date", + "local_month", + "local_day", + "local_day_of_week", + "local_time", + "local_hour", + "local_minute", + "local_day_segment") + +fitbit_data = pd.read_csv(snakemake.input[0]) +steps_data = fitbit_data[fitbit_data["fitbit_data_type"] == "steps"] + +local_timezone = pytz.timezone(snakemake.params["local_timezone"]) + + +""" +Data is pulled in intraday manner. Since data will be duplicated until the +last record from that day, first sort by time, then drop all but +the last record for each day. Drop duplicates based on aware timestamp. 
+""" +local_date_col = steps_data["timestamp"].apply(lambda ts: str(datetime.fromtimestamp(ts/1000, tz=local_timezone).date())) +steps_data = steps_data.assign(local_date=local_date_col.values) +steps_data.sort_values(by="timestamp", ascending=True, inplace=True) +steps_data.drop_duplicates(subset="local_date", keep="last", inplace=True) + +device_id = steps_data["device_id"].iloc[0] +records = [] +# Parse JSON into individual records +for record in steps_data.fitbit_data: + record = json.loads(record) # Parse text into JSON + curr_date = datetime.strptime( + record["activities-steps"][0]["dateTime"], "%Y-%m-%d") + dataset = record["activities-steps-intraday"]["dataset"] + for data in dataset: + d_time = datetime.strptime(data["time"], '%H:%M:%S').time() + d_datetime = datetime.combine(curr_date, d_time) + + # Create tuple of parsed data + row = (device_id, + data["value"], + d_datetime, + d_datetime.date(), + d_datetime.month, + d_datetime.day, + d_datetime.weekday(), + d_datetime.time(), + d_datetime.hour, + d_datetime.minute, + HOUR2EPOCH[d_datetime.hour]) + + # Append the data to a list + records.append(row) + +# Create a new DataFrame from the list of tuples. +steps_preprocessed = pd.DataFrame(data=records, columns=STEPS_COLUMNS) + +steps_preprocessed.to_csv(snakemake.output[0], index=False)