Combine all fitbit sensors in one script and one rule

replace/81e28a29d33fb01834b863c02c114ad035880a90
JulioV 2020-01-16 17:20:04 -05:00
parent 3cdd0487f1
commit 4aec2c4032
5 changed files with 155 additions and 243 deletions

View File

@@ -16,7 +16,7 @@ rule readable_datetime:
        timezones = None,
        fixed_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"]
    wildcard_constraints:
        sensor = "^fitbit.*" # ignoring fitbit sensors
        sensor = '(' + '|'.join([re.escape(x) for x in config["SENSORS"]]) + ')' # only process smartphone sensors, not fitbit
    output:
        "data/raw/{pid}/{sensor}_with_datetime.csv"
    script:
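The new constraint is built from the config instead of a hard-coded pattern. A minimal sketch of what the expression evaluates to, assuming a hypothetical SENSORS list:

import re

config = {"SENSORS": ["messages", "calls", "screen"]}  # hypothetical values
sensor_constraint = '(' + '|'.join([re.escape(x) for x in config["SENSORS"]]) + ')'
print(sensor_constraint)  # (messages|calls|screen)

With this constraint the {sensor} wildcard only matches the configured smartphone sensors, so the fitbit outputs fall through to the dedicated rule below.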
@@ -82,33 +82,14 @@ rule application_genres:
    script:
        "../src/data/application_genres.R"

rule fitbit_heartrate_with_datetime:
rule fitbit_with_datetime:
    input:
        "data/raw/{pid}/fitbit_data_raw.csv"
    params:
        local_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"],
        fitbit_sensor = "{fitbit_sensor}"
    output:
        "data/raw/{pid}/fitbit_heartrate_with_datetime.csv"
        "data/raw/{pid}/fitbit_{fitbit_sensor}_with_datetime.csv"
    script:
        "../src/data/fitbit_heartrate_with_datetime.py"

rule fitbit_steps_with_datetime:
    input:
        "data/raw/{pid}/fitbit_data_raw.csv"
    params:
        local_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"]
    output:
        "data/raw/{pid}/fitbit_steps_with_datetime.csv"
    script:
        "../src/data/fitbit_steps_with_datetime.py"

rule fitbit_sleep_with_datetime:
    input:
        "data/raw/{pid}/fitbit_data_raw.csv"
    params:
        local_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"]
    output:
        "data/raw/{pid}/fitbit_sleep_with_datetime.csv"
    script:
        "../src/data/fitbit_sleep_with_datetime.py"
        "../src/data/fitbit_readable_datetime.py"

View File

@@ -1,71 +0,0 @@
import pandas as pd
import pytz, json
from datetime import datetime

NIGHT = "night"
MORNING = "morning"
AFTERNOON = "afternoon"
EVENING = "evening"
HOUR2EPOCH = [NIGHT] * 6 + [MORNING] * 6 + [AFTERNOON] * 6 + [EVENING] * 6

HR_COLUMNS = ("device_id",
              "heartrate",
              "local_date_time",
              "local_date",
              "local_month",
              "local_day",
              "local_day_of_week",
              "local_time",
              "local_hour",
              "local_minute",
              "local_day_segment")

fitbit_data = pd.read_csv(snakemake.input[0])
heartrate_data = fitbit_data[fitbit_data["fitbit_data_type"] == "heartrate"]
local_timezone = pytz.timezone(snakemake.params["local_timezone"])

"""
Data is pulled in an intraday manner. Since data will be duplicated until the
last record from that day, first sort by time, then drop all but the last
record for each day. Drop duplicates based on the timezone-aware date.
"""
local_date_col = heartrate_data["timestamp"].apply(lambda ts: str(datetime.fromtimestamp(ts/1000, tz=local_timezone).date()))
heartrate_data = heartrate_data.assign(local_date=local_date_col.values)
heartrate_data.sort_values(by="timestamp", ascending=True, inplace=True)
heartrate_data.drop_duplicates(subset="local_date", keep="last", inplace=True)

device_id = heartrate_data["device_id"].iloc[0]
records = []

# Parse JSON into individual records
for record in heartrate_data.fitbit_data:
    record = json.loads(record)  # Parse text into JSON
    curr_date = datetime.strptime(record["activities-heart"][0]["dateTime"], "%Y-%m-%d")
    dataset = record["activities-heart-intraday"]["dataset"]
    for data in dataset:
        d_time = datetime.strptime(data["time"], '%H:%M:%S').time()
        d_datetime = datetime.combine(curr_date, d_time)
        # Create a tuple of parsed data
        row = (device_id,
               data["value"],
               d_datetime,
               d_datetime.date(),
               d_datetime.month,
               d_datetime.day,
               d_datetime.weekday(),
               d_datetime.time(),
               d_datetime.hour,
               d_datetime.minute,
               HOUR2EPOCH[d_datetime.hour])
        # Append the row to the list
        records.append(row)

# Create a new DataFrame from the list of tuples
heartrate_preprocessed = pd.DataFrame(data=records, columns=HR_COLUMNS)
heartrate_preprocessed.to_csv(snakemake.output[0], index=False)

View File

@@ -0,0 +1,150 @@
import pandas as pd
import pytz, json
from datetime import datetime

NIGHT = "night"
MORNING = "morning"
AFTERNOON = "afternoon"
EVENING = "evening"
HOUR2EPOCH = [NIGHT] * 6 + [MORNING] * 6 + [AFTERNOON] * 6 + [EVENING] * 6

HR_COLUMNS = ("device_id",
              "heartrate",
              "local_date_time", "local_date", "local_month", "local_day",
              "local_day_of_week", "local_time", "local_hour", "local_minute",
              "local_day_segment")

SLEEP_COLUMNS = ("device_id",
                 "sleep",  # 1: "asleep", 2: "restless", or 3: "awake"
                 "local_date_time", "local_date", "local_month", "local_day",
                 "local_day_of_week", "local_time", "local_hour", "local_minute",
                 "local_day_segment")

STEPS_COLUMNS = ("device_id",
                 "steps",
                 "local_date_time", "local_date", "local_month", "local_day",
                 "local_day_of_week", "local_time", "local_hour", "local_minute",
                 "local_day_segment")

def drop_duplicates(data, local_timezone):
    """
    Data is pulled in an intraday manner. Since data will be duplicated until
    the last record from that day, first sort by time, then drop all but the
    last record for each day. Drop duplicates based on the timezone-aware date.
    """
    local_date_col = data["timestamp"].apply(lambda ts: str(datetime.fromtimestamp(ts/1000, tz=local_timezone).date()))
    data = data.assign(local_date=local_date_col.values)
    data.sort_values(by="timestamp", ascending=True, inplace=True)
    data.drop_duplicates(subset="local_date", keep="last", inplace=True)
    return data
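# Illustrative example of drop_duplicates (hypothetical values): if the same
# day was pulled twice, only the last pull for that local date survives.
#
#   timestamp       local_date    kept?
#   1579150800000   2020-01-16    dropped (earlier, partial pull)
#   1579186800000   2020-01-16    kept (last record of the day)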
def parse_steps_data(steps_data):
    device_id = steps_data["device_id"].iloc[0]
    records = []
    # Parse JSON into individual records
    for record in steps_data.fitbit_data:
        record = json.loads(record)  # Parse text into JSON
        curr_date = datetime.strptime(
            record["activities-steps"][0]["dateTime"], "%Y-%m-%d")
        dataset = record["activities-steps-intraday"]["dataset"]
        for data in dataset:
            d_time = datetime.strptime(data["time"], '%H:%M:%S').time()
            d_datetime = datetime.combine(curr_date, d_time)
            row = (device_id,
                   data["value"],
                   d_datetime,
                   d_datetime.date(),
                   d_datetime.month,
                   d_datetime.day,
                   d_datetime.weekday(),
                   d_datetime.time(),
                   d_datetime.hour,
                   d_datetime.minute,
                   HOUR2EPOCH[d_datetime.hour])
            records.append(row)
    return pd.DataFrame(data=records, columns=STEPS_COLUMNS)
def parse_sleep_data(sleep_data):
    device_id = sleep_data["device_id"].iloc[0]
    records = []
    # Parse JSON into individual records
    for multi_record in sleep_data.fitbit_data:
        for record in json.loads(multi_record)["sleep"]:
            # Compute the date when sleep episodes span two days
            start_date = datetime.strptime(record["startTime"][:10], "%Y-%m-%d")
            end_date = datetime.strptime(record["endTime"][:10], "%Y-%m-%d")
            flag = 1 if start_date == end_date else 0
            for data in record["minuteData"]:
                d_time = datetime.strptime(data["dateTime"], '%H:%M:%S').time()
                if not flag and not d_time.hour:
                    flag = 1
                curr_date = end_date if flag else start_date
                d_datetime = datetime.combine(curr_date, d_time)
                row = (device_id,
                       data["value"],
                       d_datetime,
                       d_datetime.date(),
                       d_datetime.month,
                       d_datetime.day,
                       d_datetime.weekday(),
                       d_datetime.time(),
                       d_datetime.hour,
                       d_datetime.minute,
                       HOUR2EPOCH[d_datetime.hour])
                records.append(row)
    return pd.DataFrame(data=records, columns=SLEEP_COLUMNS)
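# Worked example of the rollover flag above (hypothetical episode):
# startTime 2020-01-14T23:58:00, endTime 2020-01-15T07:30:00. The 23:58
# and 23:59 minuteData entries keep the start date; at 00:00 the hour
# becomes 0, the flag flips to 1, and all remaining entries (00:00-07:30)
# are stamped with the end date.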
def parse_heartrate_data(heartrate_data):
    device_id = heartrate_data["device_id"].iloc[0]
    records = []
    # Parse JSON into individual records
    for record in heartrate_data.fitbit_data:
        record = json.loads(record)  # Parse text into JSON
        curr_date = datetime.strptime(record["activities-heart"][0]["dateTime"], "%Y-%m-%d")
        dataset = record["activities-heart-intraday"]["dataset"]
        for data in dataset:
            d_time = datetime.strptime(data["time"], '%H:%M:%S').time()
            d_datetime = datetime.combine(curr_date, d_time)
            row = (device_id,
                   data["value"],
                   d_datetime,
                   d_datetime.date(),
                   d_datetime.month,
                   d_datetime.day,
                   d_datetime.weekday(),
                   d_datetime.time(),
                   d_datetime.hour,
                   d_datetime.minute,
                   HOUR2EPOCH[d_datetime.hour])
            records.append(row)
    return pd.DataFrame(data=records, columns=HR_COLUMNS)
fitbit_data = pd.read_csv(snakemake.input[0])
local_timezone = pytz.timezone(snakemake.params["local_timezone"])
sensor = snakemake.params["fitbit_sensor"]

data = fitbit_data[fitbit_data["fitbit_data_type"] == sensor]
data = drop_duplicates(data, local_timezone)

if sensor == "heartrate":
    data_preprocessed = parse_heartrate_data(data)
elif sensor == "sleep":
    data_preprocessed = parse_sleep_data(data)
elif sensor == "steps":
    data_preprocessed = parse_steps_data(data)
else:
    raise ValueError("Unsupported fitbit sensor: " + sensor)

data_preprocessed.to_csv(snakemake.output[0], index=False)
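For reference, a minimal, self-contained sketch of the intraday payload shape that parse_heartrate_data expects, with hypothetical values (the steps parser reads the same layout under the activities-steps keys):

from datetime import datetime

NIGHT, MORNING, AFTERNOON, EVENING = "night", "morning", "afternoon", "evening"
HOUR2EPOCH = [NIGHT] * 6 + [MORNING] * 6 + [AFTERNOON] * 6 + [EVENING] * 6

record = {
    "activities-heart": [{"dateTime": "2020-01-15"}],
    "activities-heart-intraday": {"dataset": [
        {"time": "13:00:00", "value": 68},
        {"time": "13:01:00", "value": 71},
    ]},
}
curr_date = datetime.strptime(record["activities-heart"][0]["dateTime"], "%Y-%m-%d")
for data in record["activities-heart-intraday"]["dataset"]:
    d_datetime = datetime.combine(curr_date, datetime.strptime(data["time"], "%H:%M:%S").time())
    print(d_datetime, data["value"], HOUR2EPOCH[d_datetime.hour])
# 2020-01-15 13:00:00 68 afternoon
# 2020-01-15 13:01:00 71 afternoon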

View File

@@ -1,76 +0,0 @@
import pandas as pd
import pytz, json
from datetime import datetime

NIGHT = "night"
MORNING = "morning"
AFTERNOON = "afternoon"
EVENING = "evening"
HOUR2EPOCH = [NIGHT] * 6 + [MORNING] * 6 + [AFTERNOON] * 6 + [EVENING] * 6

SLEEP_COLUMNS = ("device_id",
                 "sleep",  # 1: "asleep", 2: "restless", or 3: "awake"
                 "local_date_time",
                 "local_date",
                 "local_month",
                 "local_day",
                 "local_day_of_week",
                 "local_time",
                 "local_hour",
                 "local_minute",
                 "local_day_segment")

fitbit_data = pd.read_csv(snakemake.input[0])
sleep_data = fitbit_data[fitbit_data["fitbit_data_type"] == "sleep"]
local_timezone = pytz.timezone(snakemake.params["local_timezone"])

"""
Data is pulled in an intraday manner. Since data will be duplicated until the
last record from that day, first sort by time, then drop all but the last
record for each day. Drop duplicates based on the timezone-aware date.
"""
local_date_col = sleep_data["timestamp"].apply(lambda ts: str(datetime.fromtimestamp(ts/1000, tz=local_timezone).date()))
sleep_data = sleep_data.assign(local_date=local_date_col.values)
sleep_data.sort_values(by="timestamp", ascending=True, inplace=True)
sleep_data.drop_duplicates(subset="local_date", keep="last", inplace=True)

device_id = sleep_data["device_id"].iloc[0]
records = []

# Parse JSON into individual records
for multi_record in sleep_data.fitbit_data:
    for record in json.loads(multi_record)["sleep"]:
        start_date = datetime.strptime(record["startTime"][:10], "%Y-%m-%d")
        end_date = datetime.strptime(record["endTime"][:10], "%Y-%m-%d")
        flag = 1 if start_date == end_date else 0
        for data in record["minuteData"]:
            d_time = datetime.strptime(data["dateTime"], '%H:%M:%S').time()
            if not flag and not d_time.hour:
                flag = 1
            curr_date = end_date if flag else start_date
            d_datetime = datetime.combine(curr_date, d_time)
            # Create a tuple of parsed data
            row = (device_id,
                   data["value"],
                   d_datetime,
                   d_datetime.date(),
                   d_datetime.month,
                   d_datetime.day,
                   d_datetime.weekday(),
                   d_datetime.time(),
                   d_datetime.hour,
                   d_datetime.minute,
                   HOUR2EPOCH[d_datetime.hour])
            # Append the row to the list
            records.append(row)

# Create a new DataFrame from the list of tuples
sleep_preprocessed = pd.DataFrame(data=records, columns=SLEEP_COLUMNS)
sleep_preprocessed.to_csv(snakemake.output[0], index=False)

View File

@@ -1,72 +0,0 @@
import pandas as pd
import pytz, json
from datetime import datetime

NIGHT = "night"
MORNING = "morning"
AFTERNOON = "afternoon"
EVENING = "evening"
HOUR2EPOCH = [NIGHT] * 6 + [MORNING] * 6 + [AFTERNOON] * 6 + [EVENING] * 6

STEPS_COLUMNS = ("device_id",
                 "steps",
                 "local_date_time",
                 "local_date",
                 "local_month",
                 "local_day",
                 "local_day_of_week",
                 "local_time",
                 "local_hour",
                 "local_minute",
                 "local_day_segment")

fitbit_data = pd.read_csv(snakemake.input[0])
steps_data = fitbit_data[fitbit_data["fitbit_data_type"] == "steps"]
local_timezone = pytz.timezone(snakemake.params["local_timezone"])

"""
Data is pulled in an intraday manner. Since data will be duplicated until the
last record from that day, first sort by time, then drop all but the last
record for each day. Drop duplicates based on the timezone-aware date.
"""
local_date_col = steps_data["timestamp"].apply(lambda ts: str(datetime.fromtimestamp(ts/1000, tz=local_timezone).date()))
steps_data = steps_data.assign(local_date=local_date_col.values)
steps_data.sort_values(by="timestamp", ascending=True, inplace=True)
steps_data.drop_duplicates(subset="local_date", keep="last", inplace=True)

device_id = steps_data["device_id"].iloc[0]
records = []

# Parse JSON into individual records
for record in steps_data.fitbit_data:
    record = json.loads(record)  # Parse text into JSON
    curr_date = datetime.strptime(
        record["activities-steps"][0]["dateTime"], "%Y-%m-%d")
    dataset = record["activities-steps-intraday"]["dataset"]
    for data in dataset:
        d_time = datetime.strptime(data["time"], '%H:%M:%S').time()
        d_datetime = datetime.combine(curr_date, d_time)
        # Create a tuple of parsed data
        row = (device_id,
               data["value"],
               d_datetime,
               d_datetime.date(),
               d_datetime.month,
               d_datetime.day,
               d_datetime.weekday(),
               d_datetime.time(),
               d_datetime.hour,
               d_datetime.minute,
               HOUR2EPOCH[d_datetime.hour])
        # Append the row to the list
        records.append(row)

# Create a new DataFrame from the list of tuples
steps_preprocessed = pd.DataFrame(data=records, columns=STEPS_COLUMNS)
steps_preprocessed.to_csv(snakemake.output[0], index=False)