Add date-range filter when parsing Fitbit data

pull/103/head
Meng Li 2020-11-30 12:34:14 -05:00
parent 5178be585d
commit 70991d6667
4 changed files with 53 additions and 21 deletions

View File

@ -176,7 +176,8 @@ rule phone_application_categories:
rule fitbit_parse_heartrate: rule fitbit_parse_heartrate:
input: input:
"data/raw/{pid}/fitbit_heartrate_{fitbit_data_type}_raw.csv" participant_file = "data/external/participant_files/{pid}.yaml",
raw_data = "data/raw/{pid}/fitbit_heartrate_{fitbit_data_type}_raw.csv"
params: params:
timezone = config["FITBIT_DATA_CONFIGURATION"]["TIMEZONE"]["VALUE"], timezone = config["FITBIT_DATA_CONFIGURATION"]["TIMEZONE"]["VALUE"],
table = lambda wildcards: config["FITBIT_HEARTRATE_"+str(wildcards.fitbit_data_type).upper()]["TABLE"], table = lambda wildcards: config["FITBIT_HEARTRATE_"+str(wildcards.fitbit_data_type).upper()]["TABLE"],
@ -189,7 +190,8 @@ rule fitbit_parse_heartrate:
rule fitbit_parse_steps: rule fitbit_parse_steps:
input: input:
"data/raw/{pid}/fitbit_steps_{fitbit_data_type}_raw.csv" participant_file = "data/external/participant_files/{pid}.yaml",
raw_data = "data/raw/{pid}/fitbit_steps_{fitbit_data_type}_raw.csv"
params: params:
timezone = config["FITBIT_DATA_CONFIGURATION"]["TIMEZONE"]["VALUE"], timezone = config["FITBIT_DATA_CONFIGURATION"]["TIMEZONE"]["VALUE"],
table = lambda wildcards: config["FITBIT_STEPS_"+str(wildcards.fitbit_data_type).upper()]["TABLE"], table = lambda wildcards: config["FITBIT_STEPS_"+str(wildcards.fitbit_data_type).upper()]["TABLE"],
@ -202,7 +204,8 @@ rule fitbit_parse_steps:
rule fitbit_parse_sleep: rule fitbit_parse_sleep:
input: input:
"data/raw/{pid}/fitbit_sleep_{fitbit_data_type}_raw.csv" participant_file = "data/external/participant_files/{pid}.yaml",
raw_data = "data/raw/{pid}/fitbit_sleep_{fitbit_data_type}_raw.csv"
params: params:
timezone = config["FITBIT_DATA_CONFIGURATION"]["TIMEZONE"]["VALUE"], timezone = config["FITBIT_DATA_CONFIGURATION"]["TIMEZONE"]["VALUE"],
table = lambda wildcards: config["FITBIT_SLEEP_"+str(wildcards.fitbit_data_type).upper()]["TABLE"], table = lambda wildcards: config["FITBIT_SLEEP_"+str(wildcards.fitbit_data_type).upper()]["TABLE"],

View File

@ -1,4 +1,4 @@
import json, sys import yaml, json, sys
import pandas as pd import pandas as pd
import numpy as np import numpy as np
from datetime import datetime, timezone from datetime import datetime, timezone
@ -93,7 +93,6 @@ def parseHeartrateIntradayData(records_intraday, dataset, device_id, curr_date,
records_intraday.append(row_intraday) records_intraday.append(row_intraday)
return records_intraday return records_intraday
# def append_timestamp(data):
def parseHeartrateData(heartrate_data, fitbit_data_type): def parseHeartrateData(heartrate_data, fitbit_data_type):
@ -132,11 +131,21 @@ timezone = snakemake.params["timezone"]
column_format = snakemake.params["column_format"] column_format = snakemake.params["column_format"]
fitbit_data_type = snakemake.params["fitbit_data_type"] fitbit_data_type = snakemake.params["fitbit_data_type"]
with open(snakemake.input["participant_file"], "r", encoding="utf-8") as f:
participant_file = yaml.safe_load(f)
local_start_date = pd.Timestamp(participant_file["FITBIT"]["START_DATE"])
local_end_date = pd.Timestamp(participant_file["FITBIT"]["END_DATE"]) + pd.DateOffset(1)
if column_format == "JSON": if column_format == "JSON":
json_raw = pd.read_csv(snakemake.input[0]) json_raw = pd.read_csv(snakemake.input["raw_data"])
parsed_data = parseHeartrateData(json_raw, fitbit_data_type) parsed_data = parseHeartrateData(json_raw, fitbit_data_type)
elif column_format == "PLAIN_TEXT": elif column_format == "PLAIN_TEXT":
parsed_data = pd.read_csv(snakemake.input[0], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None)) parsed_data = pd.read_csv(snakemake.input["raw_data"], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None))
else:
raise ValueError("column_format can only be one of ['JSON', 'PLAIN_TEXT'].")
# Only keep dates in the range of [local_start_date, local_end_date)
parsed_data = parsed_data.loc[(parsed_data["local_date_time"] >= local_start_date) & (parsed_data["local_date_time"] < local_end_date)]
if parsed_data.shape[0] > 0: if parsed_data.shape[0] > 0:
parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6 parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6

View File

@ -1,4 +1,4 @@
import json import json, yaml
import pandas as pd import pandas as pd
import numpy as np import numpy as np
from datetime import datetime, timedelta from datetime import datetime, timedelta
@ -222,30 +222,42 @@ column_format = snakemake.params["column_format"]
fitbit_data_type = snakemake.params["fitbit_data_type"] fitbit_data_type = snakemake.params["fitbit_data_type"]
sleep_episode_timestamp = snakemake.params["sleep_episode_timestamp"] sleep_episode_timestamp = snakemake.params["sleep_episode_timestamp"]
with open(snakemake.input["participant_file"], "r", encoding="utf-8") as f:
participant_file = yaml.safe_load(f)
local_start_date = pd.Timestamp(participant_file["FITBIT"]["START_DATE"])
local_end_date = pd.Timestamp(participant_file["FITBIT"]["END_DATE"]) + pd.DateOffset(1)
if column_format == "JSON": if column_format == "JSON":
json_raw = pd.read_csv(snakemake.input[0]) json_raw = pd.read_csv(snakemake.input["raw_data"])
parsed_data = parseSleepData(json_raw, fitbit_data_type) parsed_data = parseSleepData(json_raw, fitbit_data_type)
elif column_format == "PLAIN_TEXT": elif column_format == "PLAIN_TEXT":
if fitbit_data_type == "summary": if fitbit_data_type == "summary":
parsed_data = pd.read_csv(snakemake.input[0], parse_dates=["local_start_date_time", "local_end_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None)) parsed_data = pd.read_csv(snakemake.input["raw_data"], parse_dates=["local_start_date_time", "local_end_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None))
elif fitbit_data_type == "intraday": elif fitbit_data_type == "intraday":
parsed_data = pd.read_csv(snakemake.input[1], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None)) parsed_data = pd.read_csv(snakemake.input["raw_data"], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None))
else: else:
raise ValueError("fitbit_data_type can only be one of ['summary', 'intraday'].") raise ValueError("fitbit_data_type can only be one of ['summary', 'intraday'].")
else: else:
raise ValueError("column_format can only be one of ['JSON', 'PLAIN_TEXT'].") raise ValueError("column_format can only be one of ['JSON', 'PLAIN_TEXT'].")
if parsed_data.shape[0] > 0 and fitbit_data_type == "summary": if parsed_data.shape[0] > 0 and fitbit_data_type == "summary":
if sleep_episode_timestamp == "start":
parsed_data["timestamp"] = parsed_data["local_start_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6 if sleep_episode_timestamp != "start" and sleep_episode_timestamp != "end":
elif sleep_episode_timestamp == "end":
parsed_data["timestamp"] = parsed_data["local_end_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6
else:
raise ValueError("SLEEP_EPISODE_TIMESTAMP can only be one of ['start', 'end'].") raise ValueError("SLEEP_EPISODE_TIMESTAMP can only be one of ['start', 'end'].")
# Column name to be considered as the event datetime
datetime_column = "local_" + sleep_episode_timestamp + "_date_time"
# Only keep dates in the range of [local_start_date, local_end_date)
parsed_data = parsed_data.loc[(parsed_data[datetime_column] >= local_start_date) & (parsed_data[datetime_column] < local_end_date)]
# Convert datetime to timestamp
parsed_data["timestamp"] = parsed_data[datetime_column].dt.tz_localize(timezone).astype(np.int64) // 10**6
# Drop useless columns: local_start_date_time and local_end_date_time # Drop useless columns: local_start_date_time and local_end_date_time
parsed_data.drop(["local_start_date_time", "local_end_date_time"], axis = 1, inplace=True) parsed_data.drop(["local_start_date_time", "local_end_date_time"], axis = 1, inplace=True)
if parsed_data.shape[0] > 0 and fitbit_data_type == "intraday": if parsed_data.shape[0] > 0 and fitbit_data_type == "intraday":
# Only keep dates in the range of [local_start_date, local_end_date)
parsed_data = parsed_data.loc[(parsed_data["local_date_time"] >= local_start_date) & (parsed_data["local_date_time"] < local_end_date)]
# Convert datetime to timestamp
parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6 parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6
# Unifying level # Unifying level
parsed_data["unified_level"] = np.where(parsed_data["level"].isin(["awake", "wake", "restless"]), 0, 1) parsed_data["unified_level"] = np.where(parsed_data["level"].isin(["awake", "wake", "restless"]), 0, 1)

View File

@ -1,4 +1,4 @@
import json import json, yaml
import pandas as pd import pandas as pd
import numpy as np import numpy as np
from datetime import datetime, timezone from datetime import datetime, timezone
@ -57,14 +57,22 @@ timezone = snakemake.params["timezone"]
column_format = snakemake.params["column_format"] column_format = snakemake.params["column_format"]
fitbit_data_type = snakemake.params["fitbit_data_type"] fitbit_data_type = snakemake.params["fitbit_data_type"]
with open(snakemake.input["participant_file"], "r", encoding="utf-8") as f:
participant_file = yaml.safe_load(f)
local_start_date = pd.Timestamp(participant_file["FITBIT"]["START_DATE"])
local_end_date = pd.Timestamp(participant_file["FITBIT"]["END_DATE"]) + pd.DateOffset(1)
if column_format == "JSON": if column_format == "JSON":
json_raw = pd.read_csv(snakemake.input[0]) json_raw = pd.read_csv(snakemake.input["raw_data"])
parsed_data = parseStepsData(json_raw, fitbit_data_type) parsed_data = parseStepsData(json_raw, fitbit_data_type)
elif column_format == "PLAIN_TEXT": elif column_format == "PLAIN_TEXT":
parsed_data = pd.read_csv(snakemake.input[0], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None)) parsed_data = pd.read_csv(snakemake.input["raw_data"], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None))
else: else:
raise ValueError("column_format can only be one of ['JSON', 'PLAIN_TEXT'].") raise ValueError("column_format can only be one of ['JSON', 'PLAIN_TEXT'].")
# Only keep dates in the range of [local_start_date, local_end_date)
parsed_data = parsed_data.loc[(parsed_data["local_date_time"] >= local_start_date) & (parsed_data["local_date_time"] < local_end_date)]
if parsed_data.shape[0] > 0: if parsed_data.shape[0] > 0:
parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6 parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6