diff --git a/rules/preprocessing.snakefile b/rules/preprocessing.snakefile index 43f23fdc..4b1463b0 100644 --- a/rules/preprocessing.snakefile +++ b/rules/preprocessing.snakefile @@ -16,7 +16,7 @@ rule readable_datetime: timezones = None, fixed_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"] wildcard_constraints: - sensor = "^fitbit.*" # ignoring fitbit sensors + sensor = '(' + '|'.join([re.escape(x) for x in config["SENSORS"]]) + ')' # only process smartphone sensors, not fitbit output: "data/raw/{pid}/{sensor}_with_datetime.csv" script: @@ -82,33 +82,14 @@ rule application_genres: script: "../src/data/application_genres.R" -rule fitbit_heartrate_with_datetime: +rule fitbit_with_datetime: input: "data/raw/{pid}/fitbit_data_raw.csv" params: local_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"], + fitbit_sensor = "{fitbit_sensor}" output: - "data/raw/{pid}/fitbit_heartrate_with_datetime.csv" + "data/raw/{pid}/fitbit_{fitbit_sensor}_with_datetime.csv" script: - "../src/data/fitbit_heartrate_with_datetime.py" - -rule fitbit_steps_with_datetime: - input: - "data/raw/{pid}/fitbit_data_raw.csv" - params: - local_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"] - output: - "data/raw/{pid}/fitbit_steps_with_datetime.csv" - script: - "../src/data/fitbit_steps_with_datetime.py" - -rule fitbit_sleep_with_datetime: - input: - "data/raw/{pid}/fitbit_data_raw.csv" - params: - local_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"] - output: - "data/raw/{pid}/fitbit_sleep_with_datetime.csv" - script: - "../src/data/fitbit_sleep_with_datetime.py" + "../src/data/fitbit_readable_datetime.py" diff --git a/src/data/fitbit_heartrate_with_datetime.py b/src/data/fitbit_heartrate_with_datetime.py deleted file mode 100644 index 18994d51..00000000 --- a/src/data/fitbit_heartrate_with_datetime.py +++ /dev/null @@ -1,71 +0,0 @@ -import pandas as pd -import pytz, json -from datetime import datetime - - -NIGHT = "night" -MORNING = "morning" -AFTERNOON = 
"afternoon" -EVENING = "evening" -HOUR2EPOCH = [NIGHT] * 6 + [MORNING] * 6 + [AFTERNOON] * 6 + [EVENING] * 6 - - -HR_COLUMNS = ("device_id", - "heartrate", - "local_date_time", - "local_date", - "local_month", - "local_day", - "local_day_of_week", - "local_time", - "local_hour", - "local_minute", - "local_day_segment") - -fitbit_data = pd.read_csv(snakemake.input[0]) -heartrate_data = fitbit_data[fitbit_data["fitbit_data_type"] == "heartrate"] - -local_timezone = pytz.timezone(snakemake.params["local_timezone"]) - - -""" -Data is pulled in intraday manner. Since data will be duplicated until the -last record from that day, first sort by time, then drop all but -the last record for each day. Drop duplicates based on aware timestamp. -""" -local_date_col = heartrate_data["timestamp"].apply(lambda ts: str(datetime.fromtimestamp(ts/1000, tz=local_timezone).date())) -heartrate_data = heartrate_data.assign(local_date=local_date_col.values) -heartrate_data.sort_values(by="timestamp", ascending=True, inplace=True) -heartrate_data.drop_duplicates(subset="local_date", keep="last", inplace=True) - -device_id = heartrate_data["device_id"].iloc[0] -records = [] -# Parse JSON into individual records -for record in heartrate_data.fitbit_data: - record = json.loads(record) # Parse text into JSON - curr_date = datetime.strptime(record["activities-heart"][0]["dateTime"], "%Y-%m-%d") - dataset = record["activities-heart-intraday"]["dataset"] - for data in dataset: - d_time = datetime.strptime(data["time"], '%H:%M:%S').time() - d_datetime = datetime.combine(curr_date, d_time) - - # Create tuple of parsed data - row = (device_id, - data["value"], - d_datetime, - d_datetime.date(), - d_datetime.month, - d_datetime.day, - d_datetime.weekday(), - d_datetime.time(), - d_datetime.hour, - d_datetime.minute, - HOUR2EPOCH[d_datetime.hour]) - - # Append the data to a list - records.append(row) - -# Create a new DataFrame from the list of tuples. 
-heartrate_preprocessed = pd.DataFrame(data=records, columns=HR_COLUMNS) - -heartrate_preprocessed.to_csv(snakemake.output[0], index=False) diff --git a/src/data/fitbit_readable_datetime.py b/src/data/fitbit_readable_datetime.py new file mode 100644 index 00000000..ad09ebb8 --- /dev/null +++ b/src/data/fitbit_readable_datetime.py @@ -0,0 +1,150 @@ +import pandas as pd +import pytz, json +from datetime import datetime + + +NIGHT = "night" +MORNING = "morning" +AFTERNOON = "afternoon" +EVENING = "evening" +HOUR2EPOCH = [NIGHT] * 6 + [MORNING] * 6 + [AFTERNOON] * 6 + [EVENING] * 6 + + +HR_COLUMNS = ("device_id", + "heartrate", + "local_date_time", "local_date", "local_month", "local_day", + "local_day_of_week", "local_time", "local_hour", "local_minute", + "local_day_segment") + +SLEEP_COLUMNS = ("device_id", + "sleep", # 1: "asleep", 2: "restless", or 3: "awake" + "local_date_time", "local_date", "local_month", "local_day", + "local_day_of_week", "local_time", "local_hour", "local_minute", + "local_day_segment") + +STEPS_COLUMNS = ("device_id", + "steps", + "local_date_time", "local_date", "local_month", "local_day", + "local_day_of_week", "local_time", "local_hour", "local_minute", + "local_day_segment") + +def drop_duplicates(data, local_timezone): + """ + Data is pulled in intraday manner. Since data will be duplicated until the + last record from that day, first sort by time, then drop all but + the last record for each day. Drop duplicates based on aware timestamp. 
+ """ + local_date_col = data["timestamp"].apply(lambda ts: str(datetime.fromtimestamp(ts/1000, tz=local_timezone).date())) + data = data.assign(local_date=local_date_col.values) + data.sort_values(by="timestamp", ascending=True, inplace=True) + data.drop_duplicates(subset="local_date", keep="last", inplace=True) + + return data + +def parse_steps_data(steps_data): + device_id = steps_data["device_id"].iloc[0] + records = [] + # Parse JSON into individual records + for record in steps_data.fitbit_data: + record = json.loads(record) # Parse text into JSON + curr_date = datetime.strptime( + record["activities-steps"][0]["dateTime"], "%Y-%m-%d") + dataset = record["activities-steps-intraday"]["dataset"] + for data in dataset: + d_time = datetime.strptime(data["time"], '%H:%M:%S').time() + d_datetime = datetime.combine(curr_date, d_time) + + row = (device_id, + data["value"], + d_datetime, + d_datetime.date(), + d_datetime.month, + d_datetime.day, + d_datetime.weekday(), + d_datetime.time(), + d_datetime.hour, + d_datetime.minute, + HOUR2EPOCH[d_datetime.hour]) + + records.append(row) + + return pd.DataFrame(data=records, columns=STEPS_COLUMNS) + +def parse_sleep_data(sleep_data): + device_id = sleep_data["device_id"].iloc[0] + records = [] + # Parse JSON into individual records + for multi_record in sleep_data.fitbit_data: + for record in json.loads(multi_record)["sleep"]: + + # Compute date when sleep episodes span two days + start_date = datetime.strptime(record["startTime"][:10], "%Y-%m-%d") + end_date = datetime.strptime(record["endTime"][:10], "%Y-%m-%d") + flag = 1 if start_date == end_date else 0 + for data in record["minuteData"]: + d_time = datetime.strptime(data["dateTime"], '%H:%M:%S').time() + if not flag and not d_time.hour: + flag = 1 + curr_date = end_date if flag else start_date + d_datetime = datetime.combine(curr_date, d_time) + + row = (device_id, + data["value"], + d_datetime, + d_datetime.date(), + d_datetime.month, + d_datetime.day, + 
d_datetime.weekday(), + d_datetime.time(), + d_datetime.hour, + d_datetime.minute, + HOUR2EPOCH[d_datetime.hour]) + + records.append(row) + + return pd.DataFrame(data=records, columns=SLEEP_COLUMNS) + +def parse_heartrate_data(heartrate_data): + device_id = heartrate_data["device_id"].iloc[0] + records = [] + # Parse JSON into individual records + for record in heartrate_data.fitbit_data: + record = json.loads(record) # Parse text into JSON + curr_date = datetime.strptime(record["activities-heart"][0]["dateTime"], "%Y-%m-%d") + dataset = record["activities-heart-intraday"]["dataset"] + for data in dataset: + d_time = datetime.strptime(data["time"], '%H:%M:%S').time() + d_datetime = datetime.combine(curr_date, d_time) + + row = (device_id, + data["value"], + d_datetime, + d_datetime.date(), + d_datetime.month, + d_datetime.day, + d_datetime.weekday(), + d_datetime.time(), + d_datetime.hour, + d_datetime.minute, + HOUR2EPOCH[d_datetime.hour]) + + records.append(row) + + return pd.DataFrame(data=records, columns=HR_COLUMNS) + + +fitbit_data = pd.read_csv(snakemake.input[0]) +local_timezone = pytz.timezone(snakemake.params["local_timezone"]) +sensor = snakemake.params["fitbit_sensor"] + +data = fitbit_data[fitbit_data["fitbit_data_type"] == sensor] +data = drop_duplicates(data, local_timezone) + +if sensor == "heartrate": + data_preprocessed = parse_heartrate_data(data) +elif sensor == "sleep": + data_preprocessed = parse_sleep_data(data) +elif sensor == "steps": + data_preprocessed = parse_steps_data(data) + +data_preprocessed.to_csv(snakemake.output[0], index=False) diff --git a/src/data/fitbit_sleep_with_datetime.py b/src/data/fitbit_sleep_with_datetime.py deleted file mode 100644 index 213f00fa..00000000 --- a/src/data/fitbit_sleep_with_datetime.py +++ /dev/null @@ -1,76 +0,0 @@ -import pandas as pd -import pytz, json -from datetime import datetime - - - -NIGHT = "night" -MORNING = "morning" -AFTERNOON = "afternoon" -EVENING = "evening" -HOUR2EPOCH = [NIGHT] * 6 + 
[MORNING] * 6 + [AFTERNOON] * 6 + [EVENING] * 6 - - -SLEEP_COLUMNS = ("device_id", - "sleep", # 1: "asleep", 2: "restless", or 3: "awake" - "local_date_time", - "local_date", - "local_month", - "local_day", - "local_day_of_week", - "local_time", - "local_hour", - "local_minute", - "local_day_segment") - -fitbit_data = pd.read_csv(snakemake.input[0]) -sleep_data = fitbit_data[fitbit_data["fitbit_data_type"] == "sleep"] - -local_timezone = pytz.timezone(snakemake.params["local_timezone"]) - - -""" -Data is pulled in intraday manner. Since data will be duplicated until the -last record from that day, first sort by time, then drop all but -the last record for each day. Drop duplicates based on aware timestamp. -""" -local_date_col = sleep_data["timestamp"].apply(lambda ts: str(datetime.fromtimestamp(ts/1000, tz=local_timezone).date())) -sleep_data = sleep_data.assign(local_date=local_date_col.values) -sleep_data.sort_values(by="timestamp", ascending=True, inplace=True) -sleep_data.drop_duplicates(subset="local_date", keep="last", inplace=True) - -device_id = sleep_data["device_id"].iloc[0] -records = [] -# Parse JSON into individual records -for multi_record in sleep_data.fitbit_data: - for record in json.loads(multi_record)["sleep"]: - start_date = datetime.strptime(record["startTime"][:10], "%Y-%m-%d") - end_date = datetime.strptime(record["endTime"][:10], "%Y-%m-%d") - flag = 1 if start_date == end_date else 0 - for data in record["minuteData"]: - d_time = datetime.strptime(data["dateTime"], '%H:%M:%S').time() - if not flag and not d_time.hour: - flag = 1 - curr_date = end_date if flag else start_date - d_datetime = datetime.combine(curr_date, d_time) - - # Create tuple of parsed data - row = (device_id, - data["value"], - d_datetime, - d_datetime.date(), - d_datetime.month, - d_datetime.day, - d_datetime.weekday(), - d_datetime.time(), - d_datetime.hour, - d_datetime.minute, - HOUR2EPOCH[d_datetime.hour]) - - # Append the data to a list - records.append(row) - -# 
Create a new DataFrame from the list of tuples. -sleep_preprocessed = pd.DataFrame(data=records, columns=SLEEP_COLUMNS) - -sleep_preprocessed.to_csv(snakemake.output[0], index=False) diff --git a/src/data/fitbit_steps_with_datetime.py b/src/data/fitbit_steps_with_datetime.py deleted file mode 100644 index e741c4a2..00000000 --- a/src/data/fitbit_steps_with_datetime.py +++ /dev/null @@ -1,72 +0,0 @@ -import pandas as pd -import pytz, json -from datetime import datetime - - -NIGHT = "night" -MORNING = "morning" -AFTERNOON = "afternoon" -EVENING = "evening" -HOUR2EPOCH = [NIGHT] * 6 + [MORNING] * 6 + [AFTERNOON] * 6 + [EVENING] * 6 - - -STEPS_COLUMNS = ("device_id", - "steps", - "local_date_time", - "local_date", - "local_month", - "local_day", - "local_day_of_week", - "local_time", - "local_hour", - "local_minute", - "local_day_segment") - -fitbit_data = pd.read_csv(snakemake.input[0]) -steps_data = fitbit_data[fitbit_data["fitbit_data_type"] == "steps"] - -local_timezone = pytz.timezone(snakemake.params["local_timezone"]) - - -""" -Data is pulled in intraday manner. Since data will be duplicated until the -last record from that day, first sort by time, then drop all but -the last record for each day. Drop duplicates based on aware timestamp. 
-""" -local_date_col = steps_data["timestamp"].apply(lambda ts: str(datetime.fromtimestamp(ts/1000, tz=local_timezone).date())) -steps_data = steps_data.assign(local_date=local_date_col.values) -steps_data.sort_values(by="timestamp", ascending=True, inplace=True) -steps_data.drop_duplicates(subset="local_date", keep="last", inplace=True) - -device_id = steps_data["device_id"].iloc[0] -records = [] -# Parse JSON into individual records -for record in steps_data.fitbit_data: - record = json.loads(record) # Parse text into JSON - curr_date = datetime.strptime( - record["activities-steps"][0]["dateTime"], "%Y-%m-%d") - dataset = record["activities-steps-intraday"]["dataset"] - for data in dataset: - d_time = datetime.strptime(data["time"], '%H:%M:%S').time() - d_datetime = datetime.combine(curr_date, d_time) - - # Create tuple of parsed data - row = (device_id, - data["value"], - d_datetime, - d_datetime.date(), - d_datetime.month, - d_datetime.day, - d_datetime.weekday(), - d_datetime.time(), - d_datetime.hour, - d_datetime.minute, - HOUR2EPOCH[d_datetime.hour]) - - # Append the data to a list - records.append(row) - -# Create a new DataFrame from the list of tuples. -steps_preprocessed = pd.DataFrame(data=records, columns=STEPS_COLUMNS) - -steps_preprocessed.to_csv(snakemake.output[0], index=False)