From deba6b9e4faf4a473172d10566ca9b41df73490a Mon Sep 17 00:00:00 2001 From: Meng Li <34143965+Meng6@users.noreply.github.com> Date: Mon, 23 Nov 2020 12:01:00 -0500 Subject: [PATCH] Update sleep summary features for segments --- Snakefile | 25 +-- config.yaml | 14 +- rules/features.smk | 26 +++ rules/preprocessing.smk | 27 +-- src/data/fitbit_parse_sleep.py | 155 ++++++++++-------- .../fitbit_sleep/fitbit_sleep_base.py | 70 -------- src/features/fitbit_sleep_features.py | 18 -- .../fitbit_sleep_summary/rapids/main.py | 91 ++++++++++ src/features/fitbit_step_features.py | 66 -------- 9 files changed, 241 insertions(+), 251 deletions(-) delete mode 100644 src/features/fitbit_sleep/fitbit_sleep_base.py delete mode 100644 src/features/fitbit_sleep_features.py create mode 100644 src/features/fitbit_sleep_summary/rapids/main.py delete mode 100644 src/features/fitbit_step_features.py diff --git a/Snakefile b/Snakefile index ab7ed4eb..a97d1b78 100644 --- a/Snakefile +++ b/Snakefile @@ -147,10 +147,6 @@ for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys(): if config["FITBIT_CALORIES"]["TABLE_FORMAT"] not in ["JSON", "CSV"]: raise ValueError("config['FITBIT_CALORIES']['TABLE_FORMAT'] should be JSON or CSV but you typed" + config["FITBIT_CALORIES"]["TABLE_FORMAT"]) -if config["FITBIT_SLEEP"]["TABLE_FORMAT"] not in ["JSON", "CSV"]: - raise ValueError("config['FITBIT_SLEEP']['TABLE_FORMAT'] should be JSON or CSV but you typed" + config["FITBIT_SLEEP"]["TABLE_FORMAT"]) - - for provider in config["FITBIT_HEARTRATE_SUMMARY"]["PROVIDERS"].keys(): if config["FITBIT_HEARTRATE_SUMMARY"]["PROVIDERS"][provider]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_summary_raw.csv", pid=config["PIDS"])) @@ -167,6 +163,20 @@ for provider in config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"].keys(): files_to_compute.extend(expand("data/interim/{pid}/fitbit_heartrate_intraday_features/fitbit_heartrate_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_heartrate_intraday.csv", pid=config["PIDS"])) +for provider in config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"].keys(): + if config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_summary_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_summary_parsed.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_summary_parsed_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/fitbit_sleep_summary_features/fitbit_sleep_summary_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_sleep_summary.csv", pid=config["PIDS"])) + +# for provider in config["FITBIT_SLEEP_INTRADAY"]["PROVIDERS"].keys(): +# if config["FITBIT_SLEEP_INTRADAY"]["PROVIDERS"][provider]["COMPUTE"]: +# files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_intraday_raw.csv", pid=config["PIDS"])) +# files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_intraday_parsed.csv", pid=config["PIDS"])) +# files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_intraday_parsed_with_datetime.csv", pid=config["PIDS"])) + for provider in config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"].keys(): if config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"][provider]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_summary_raw.csv", pid=config["PIDS"])) @@ -189,13 +199,6 @@ for provider in config["FITBIT_CALORIES"]["PROVIDERS"].keys(): files_to_compute.extend(expand("data/raw/{pid}/fitbit_calories_{fitbit_data_type}_parsed.csv", pid=config["PIDS"], fitbit_data_type=["summary", "intraday"])) files_to_compute.extend(expand("data/raw/{pid}/fitbit_calories_{fitbit_data_type}_parsed_with_datetime.csv", pid=config["PIDS"], fitbit_data_type=["summary", "intraday"])) -for provider in config["FITBIT_SLEEP"]["PROVIDERS"].keys(): - if config["FITBIT_SLEEP"]["PROVIDERS"][provider]["COMPUTE"]: - files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_{fitbit_data_type}_raw.csv", pid=config["PIDS"], fitbit_data_type=(["json"] if config["FITBIT_SLEEP"]["TABLE_FORMAT"] == "JSON" else ["summary", "intraday"]))) - files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_{fitbit_data_type}_parsed_episodes.csv", pid=config["PIDS"], fitbit_data_type=["summary"])) - files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_{fitbit_data_type}_parsed.csv", pid=config["PIDS"], fitbit_data_type=["intraday"])) - files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_{fitbit_data_type}_parsed_with_datetime.csv", pid=config["PIDS"], fitbit_data_type=["intraday"])) - # visualization for data exploration if config["HEATMAP_FEATURES_CORRELATIONS"]["PLOT"]: files_to_compute.extend(expand("reports/data_exploration/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/heatmap_features_correlations.html", min_valid_hours_per_day=config["HEATMAP_FEATURES_CORRELATIONS"]["MIN_VALID_HOURS_PER_DAY"], min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"])) diff --git a/config.yaml b/config.yaml index 94d71942..e91f7527 100644 --- a/config.yaml +++ b/config.yaml @@ -300,18 +300,16 @@ FITBIT_STEPS_INTRADAY: SRC_FOLDER: "rapids" # inside src/features/fitbit_steps_intraday SRC_LANGUAGE: "python" -FITBIT_SLEEP: - TABLE_FORMAT: JSON # JSON or CSV. If your JSON or CSV data are files change [DEVICE_DATA][FITBIT][SOURCE][TYPE] to FILES - TABLE: - JSON: fitbit_sleep - CSV: - SUMMARY: sleep_summary - INTRADAY: sleep_intraday +FITBIT_SLEEP_SUMMARY: + TABLE: sleep_summary + SLEEP_EPISODE_TIMESTAMP: end # summary sleep episodes are considered as events based on either the start timestamp or end timestamp. PROVIDERS: RAPIDS: COMPUTE: False + FEATURES: ["countepisode", "avgefficiency", "sumdurationafterwakeup", "sumdurationasleep", "sumdurationawake", "sumdurationtofallasleep", "sumdurationinbed", "avgdurationafterwakeup", "avgdurationasleep", "avgdurationawake", "avgdurationtofallasleep", "avgdurationinbed"] SLEEP_TYPES: ["main", "nap", "all"] - SUMMARY_FEATURES: ["sumdurationafterwakeup", "sumdurationasleep", "sumdurationawake", "sumdurationtofallasleep", "sumdurationinbed", "avgefficiency", "countepisode"] + SRC_FOLDER: "rapids" # inside src/features/fitbit_sleep_summary + SRC_LANGUAGE: "python" FITBIT_CALORIES: TABLE_FORMAT: JSON # JSON or CSV. If your JSON or CSV data are files change [DEVICE_DATA][FITBIT][SOURCE][TYPE] to FILES diff --git a/rules/features.smk b/rules/features.smk index 8348a386..d4a103ae 100644 --- a/rules/features.smk +++ b/rules/features.smk @@ -476,6 +476,32 @@ rule fitbit_steps_intraday_r_features: script: "../src/features/entry.R" +rule fitbit_sleep_summary_python_features: + input: + sensor_data = "data/raw/{pid}/fitbit_sleep_summary_parsed_with_datetime.csv", + day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + params: + provider = lambda wildcards: config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"][wildcards.provider_key.upper()], + provider_key = "{provider_key}", + sensor_key = "fitbit_sleep_summary" + output: + "data/interim/{pid}/fitbit_sleep_summary_features/fitbit_sleep_summary_python_{provider_key}.csv" + script: + "../src/features/entry.py" + +rule fitbit_sleep_summary_r_features: + input: + sensor_data = "data/raw/{pid}/fitbit_sleep_summary_parsed_with_datetime.csv", + day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + params: + provider = lambda wildcards: config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"][wildcards.provider_key.upper()], + provider_key = "{provider_key}", + sensor_key = "fitbit_sleep_summary" + output: + "data/interim/{pid}/fitbit_sleep_summary_features/fitbit_sleep_summary_r_{provider_key}.csv" + script: + "../src/features/entry.R" + # rule fitbit_sleep_features: # input: # sleep_summary_data = "data/raw/{pid}/fitbit_sleep_summary_with_datetime.csv", diff --git a/rules/preprocessing.smk b/rules/preprocessing.smk index 02d9be7e..bc6e8634 100644 --- a/rules/preprocessing.smk +++ b/rules/preprocessing.smk @@ -206,6 +206,20 @@ rule fitbit_parse_steps: script: "../src/data/fitbit_parse_steps.py" +rule fitbit_parse_sleep: + input: + "data/raw/{pid}/fitbit_sleep_{fitbit_data_type}_raw.csv" + params: + timezone = config["DEVICE_DATA"]["PHONE"]["TIMEZONE"]["VALUE"], + table = lambda wildcards: config["FITBIT_SLEEP_"+str(wildcards.fitbit_data_type).upper()]["TABLE"], + column_format = config["DEVICE_DATA"]["FITBIT"]["SOURCE"]["COLUMN_FORMAT"], + fitbit_data_type = "{fitbit_data_type}", + sleep_episode_timestamp = config["FITBIT_SLEEP_SUMMARY"]["SLEEP_EPISODE_TIMESTAMP"] + output: + "data/raw/{pid}/fitbit_sleep_{fitbit_data_type}_parsed.csv" + script: + "../src/data/fitbit_parse_sleep.py" + rule fitbit_parse_calories: input: data = expand("data/raw/{{pid}}/fitbit_calories_{fitbit_data_type}_raw.csv", fitbit_data_type = (["json"] if config["FITBIT_CALORIES"]["TABLE_FORMAT"] == "JSON" else ["summary", "intraday"])) @@ -219,19 +233,6 @@ rule fitbit_parse_calories: script: "../src/data/fitbit_parse_calories.py" -rule fitbit_parse_sleep: - input: - data = expand("data/raw/{{pid}}/fitbit_sleep_{fitbit_data_type}_raw.csv", fitbit_data_type = (["json"] if config["FITBIT_SLEEP"]["TABLE_FORMAT"] == "JSON" else ["summary", "intraday"])) - params: - timezone = config["DEVICE_DATA"]["PHONE"]["TIMEZONE"]["VALUE"], - table = config["FITBIT_SLEEP"]["TABLE"], - table_format = config["FITBIT_SLEEP"]["TABLE_FORMAT"] - output: - summary_data = "data/raw/{pid}/fitbit_sleep_summary_parsed_episodes.csv", - intraday_data = "data/raw/{pid}/fitbit_sleep_intraday_parsed.csv" - script: - "../src/data/fitbit_parse_sleep.py" - rule fitbit_readable_datetime: input: sensor_input = "data/raw/{pid}/fitbit_{sensor}_{fitbit_data_type}_parsed.csv", diff --git a/src/data/fitbit_parse_sleep.py b/src/data/fitbit_parse_sleep.py index 03278fec..6c673f95 100644 --- a/src/data/fitbit_parse_sleep.py +++ b/src/data/fitbit_parse_sleep.py @@ -1,9 +1,8 @@ import json import pandas as pd -from datetime import datetime import numpy as np +from datetime import datetime, timedelta import dateutil.parser -from datetime import timedelta SLEEP_CODE2LEVEL = ["asleep", "restless", "awake"] @@ -12,7 +11,7 @@ SLEEP_SUMMARY_COLUMNS_V1_2 = ("device_id", "efficiency", "minutes_after_wakeup", "minutes_asleep", "minutes_awake", "minutes_to_fall_asleep", "minutes_in_bed", "is_main_sleep", "type", "local_start_date_time", "local_end_date_time", - "start_timestamp", "end_timestamp") + "timestamp") SLEEP_SUMMARY_COLUMNS_V1 = SLEEP_SUMMARY_COLUMNS_V1_2 + ("count_awake", "duration_awake", "count_awakenings", "count_restless", "duration_restless") SLEEP_INTRADAY_COLUMNS = ("device_id", @@ -71,71 +70,75 @@ def classicData1min(data_summary): newRow = {'dateTime':dateutil.parser.parse(origEntry['dateTime'])+timedelta(seconds=counter*timeDuration),'level':origEntry['level'],'seconds':timeDuration} dataList.append(newRow) counter = counter + 1 - # print(dataList) return dataList -# Parse one record for sleep API version 1 -def parseOneRecordForV1(record, device_id, d_is_main_sleep, records_summary, records_intraday): - # Summary data +# Parse one record for sleep API version 1 +def parseOneRecordForV1(record, device_id, d_is_main_sleep, records_summary, records_intraday, fitbit_data_type): + sleep_record_type = "classic" d_start_datetime = datetime.strptime(record["startTime"][:18], "%Y-%m-%dT%H:%M:%S") d_end_datetime = datetime.strptime(record["endTime"][:18], "%Y-%m-%dT%H:%M:%S") - row_summary = (device_id, record["efficiency"], - record["minutesAfterWakeup"], record["minutesAsleep"], record["minutesAwake"], record["minutesToFallAsleep"], record["timeInBed"], - d_is_main_sleep, sleep_record_type, - d_start_datetime, d_end_datetime, - d_start_datetime.date(), d_end_datetime.date(), - 0,0, - record["awakeCount"], record["awakeDuration"], record["awakeningsCount"], - record["restlessCount"], record["restlessDuration"]) - - records_summary.append(row_summary) + # Summary data + if fitbit_data_type == "summary": + row_summary = (device_id, record["efficiency"], + record["minutesAfterWakeup"], record["minutesAsleep"], record["minutesAwake"], record["minutesToFallAsleep"], record["timeInBed"], + d_is_main_sleep, sleep_record_type, + d_start_datetime, d_end_datetime, + 0, + record["awakeCount"], record["awakeDuration"], record["awakeningsCount"], + record["restlessCount"], record["restlessDuration"]) + + records_summary.append(row_summary) # Intraday data - start_date = d_start_datetime.date() - end_date = d_end_datetime.date() - is_before_midnight = True - curr_date = start_date - for data in record["minuteData"]: - # For overnight episodes, use end_date once we are over midnight - d_time = datetime.strptime(data["dateTime"], '%H:%M:%S').time() - if is_before_midnight and d_time.hour == 0: - curr_date = end_date - d_datetime = datetime.combine(curr_date, d_time) + if fitbit_data_type == "intraday": + start_date = d_start_datetime.date() + end_date = d_end_datetime.date() + is_before_midnight = True + curr_date = start_date + for data in record["minuteData"]: + # For overnight episodes, use end_date once we are over midnight + d_time = datetime.strptime(data["dateTime"], '%H:%M:%S').time() + if is_before_midnight and d_time.hour == 0: + curr_date = end_date + d_datetime = datetime.combine(curr_date, d_time) - # API 1.2 stores original_level as strings, so we convert original_levels of API 1 to strings too - # (1: "asleep", 2: "restless", 3: "awake") - d_original_level = SLEEP_CODE2LEVEL[int(data["value"])-1] + # API 1.2 stores original_level as strings, so we convert original_levels of API 1 to strings too + # (1: "asleep", 2: "restless", 3: "awake") + d_original_level = SLEEP_CODE2LEVEL[int(data["value"])-1] - row_intraday = (device_id, - d_original_level, -1, d_is_main_sleep, sleep_record_type, - d_datetime, 0) + row_intraday = (device_id, + d_original_level, -1, d_is_main_sleep, sleep_record_type, + d_datetime, 0) - records_intraday.append(row_intraday) + records_intraday.append(row_intraday) return records_summary, records_intraday # Parse one record for sleep API version 1.2 -def parseOneRecordForV12(record, device_id, d_is_main_sleep, records_summary, records_intraday): +def parseOneRecordForV12(record, device_id, d_is_main_sleep, records_summary, records_intraday, fitbit_data_type): - # Summary data sleep_record_type = record['type'] d_start_datetime = datetime.strptime(record["startTime"][:18], "%Y-%m-%dT%H:%M:%S") d_end_datetime = datetime.strptime(record["endTime"][:18], "%Y-%m-%dT%H:%M:%S") - row_summary = (device_id, record["efficiency"], - record["minutesAfterWakeup"], record["minutesAsleep"], record["minutesAwake"], record["minutesToFallAsleep"], record["timeInBed"], - d_is_main_sleep, sleep_record_type, - d_start_datetime, d_end_datetime, - 0,0) + # Summary data + if fitbit_data_type == "summary": + row_summary = (device_id, record["efficiency"], + record["minutesAfterWakeup"], record["minutesAsleep"], record["minutesAwake"], record["minutesToFallAsleep"], record["timeInBed"], + d_is_main_sleep, sleep_record_type, + d_start_datetime, d_end_datetime, + 0,0) + + records_summary.append(row_summary) - records_summary.append(row_summary) - if sleep_record_type == 'classic': - # Intraday data + # Intraday data + if fitbit_data_type == "intraday": + if sleep_record_type == 'classic': start_date = d_start_datetime.date() end_date = d_end_datetime.date() is_before_midnight = True @@ -155,8 +158,8 @@ def parseOneRecordForV12(record, device_id, d_is_main_sleep, records_summary, re d_original_level, -1, d_is_main_sleep, sleep_record_type, d_datetime, 0) records_intraday.append(row_intraday) - else: - ## for sleep type "stages" + else: + # For sleep type "stages" start_date = d_start_datetime.date() end_date = d_end_datetime.date() is_before_midnight = True @@ -182,7 +185,7 @@ def parseOneRecordForV12(record, device_id, d_is_main_sleep, records_summary, re -def parseSleepData(sleep_data): +def parseSleepData(sleep_data, fitbit_data_type): SLEEP_SUMMARY_COLUMNS = SLEEP_SUMMARY_COLUMNS_V1_2 if sleep_data.empty: return pd.DataFrame(columns=SLEEP_SUMMARY_COLUMNS), pd.DataFrame(columns=SLEEP_INTRADAY_COLUMNS) @@ -197,32 +200,54 @@ def parseSleepData(sleep_data): # For sleep API version 1 if "awakeCount" in record: SLEEP_SUMMARY_COLUMNS = SLEEP_SUMMARY_COLUMNS_V1 - records_summary, records_intraday = parseOneRecordForV1(record, device_id, d_is_main_sleep, records_summary, records_intraday) + records_summary, records_intraday = parseOneRecordForV1(record, device_id, d_is_main_sleep, records_summary, records_intraday, fitbit_data_type) # For sleep API version 1.2 else: SLEEP_SUMMARY_COLUMNS = SLEEP_SUMMARY_COLUMNS_V1_2 - records_summary, records_intraday = parseOneRecordForV12(record, device_id, d_is_main_sleep, records_summary, records_intraday) + records_summary, records_intraday = parseOneRecordForV12(record, device_id, d_is_main_sleep, records_summary, records_intraday, fitbit_data_type) + + if fitbit_data_type == "summary": + parsed_data = pd.DataFrame(data=records_summary, columns=SLEEP_SUMMARY_COLUMNS) + elif fitbit_data_type == "intraday": + parsed_data = pd.DataFrame(data=records_intraday, columns=SLEEP_INTRADAY_COLUMNS) + else: + raise ValueError("fitbit_data_type can only be one of ['summary', 'intraday'].") + + return parsed_data + - return pd.DataFrame(data=records_summary, columns=SLEEP_SUMMARY_COLUMNS), pd.DataFrame(data=records_intraday, columns=SLEEP_INTRADAY_COLUMNS) -table_format = snakemake.params["table_format"] timezone = snakemake.params["timezone"] +column_format = snakemake.params["column_format"] +fitbit_data_type = snakemake.params["fitbit_data_type"] +sleep_episode_timestamp = snakemake.params["sleep_episode_timestamp"] -if table_format == "JSON": +if column_format == "JSON": json_raw = pd.read_csv(snakemake.input[0]) - summary, intraday = parseSleepData(json_raw) -elif table_format == "CSV": - summary = pd.read_csv(snakemake.input[0], parse_dates=["local_start_date_time", "local_end_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None)) - intraday = pd.read_csv(snakemake.input[1], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None)) + parsed_data = parseSleepData(json_raw, fitbit_data_type) +elif column_format == "PLAIN_TEXT": + if fitbit_data_type == "summary": + parsed_data = pd.read_csv(snakemake.input[0], parse_dates=["local_start_date_time", "local_end_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None)) + elif fitbit_data_type == "intraday": + parsed_data = pd.read_csv(snakemake.input[1], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None)) + else: + raise ValueError("fitbit_data_type can only be one of ['summary', 'intraday'].") +else: + raise ValueError("column_format can only be one of ['JSON', 'PLAIN_TEXT'].") -if summary.shape[0] > 0: - summary["start_timestamp"] = summary["local_start_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6 - summary["end_timestamp"] = summary["local_end_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6 -if intraday.shape[0] > 0: - intraday["timestamp"] = intraday["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6 +if parsed_data.shape[0] > 0 and fitbit_data_type == "summary": + if sleep_episode_timestamp == "start": + parsed_data["timestamp"] = parsed_data["local_start_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6 + elif sleep_episode_timestamp == "end": + parsed_data["timestamp"] = parsed_data["local_end_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6 + else: + raise ValueError("SLEEP_EPISODE_TIMESTAMP can only be one of ['start', 'end'].") + # Drop useless columns: local_start_date_time and local_end_date_time + parsed_data.drop(["local_start_date_time", "local_end_date_time"], axis = 1, inplace=True) -# Unifying level -intraday["unified_level"] = np.where(intraday["level"].isin(["awake", "wake", "restless"]), 0, 1) +if parsed_data.shape[0] > 0 and fitbit_data_type == "intraday": + parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6 + # Unifying level + parsed_data["unified_level"] = np.where(parsed_data["level"].isin(["awake", "wake", "restless"]), 0, 1) -summary.to_csv(snakemake.output["summary_data"], index=False) -intraday.to_csv(snakemake.output["intraday_data"], index=False) \ No newline at end of file +parsed_data.to_csv(snakemake.output[0], index=False) \ No newline at end of file diff --git a/src/features/fitbit_sleep/fitbit_sleep_base.py b/src/features/fitbit_sleep/fitbit_sleep_base.py deleted file mode 100644 index d4eca912..00000000 --- a/src/features/fitbit_sleep/fitbit_sleep_base.py +++ /dev/null @@ -1,70 +0,0 @@ -import pandas as pd -import itertools - - - -def dailyFeaturesFromSummaryData(sleep_daily_features, sleep_summary_data, summary_features, sleep_type): - if sleep_type == "main": - sleep_summary_data = sleep_summary_data[sleep_summary_data["is_main_sleep"] == 1] - elif sleep_type == "nap": - sleep_summary_data = sleep_summary_data[sleep_summary_data["is_main_sleep"] == 0] - elif sleep_type == "all": - pass - else: - raise ValueError("sleep_type can only be one of ['main', 'nap', 'all'].") - - features_sum = sleep_summary_data[["minutes_after_wakeup", "minutes_asleep", "minutes_awake", "minutes_to_fall_asleep", "minutes_in_bed", "local_end_date"]].groupby(["local_end_date"]).sum() - features_sum.index.rename("local_date", inplace=True) - if "sumdurationafterwakeup" in summary_features: - sleep_daily_features = sleep_daily_features.join(features_sum[["minutes_after_wakeup"]], how="outer").rename(columns={"minutes_after_wakeup": "sleep_daily_sumdurationafterwakeup" + sleep_type}) - if "sumdurationasleep" in summary_features: - sleep_daily_features = sleep_daily_features.join(features_sum[["minutes_asleep"]], how="outer").rename(columns={"minutes_asleep": "sleep_daily_sumdurationasleep" + sleep_type}) - if "sumdurationawake" in summary_features: - sleep_daily_features = sleep_daily_features.join(features_sum[["minutes_awake"]], how="outer").rename(columns={"minutes_awake": "sleep_daily_sumdurationawake" + sleep_type}) - if "sumdurationtofallasleep" in summary_features: - sleep_daily_features = sleep_daily_features.join(features_sum[["minutes_to_fall_asleep"]], how="outer").rename(columns={"minutes_to_fall_asleep": "sleep_daily_sumdurationtofallasleep" + sleep_type}) - if "sumdurationinbed" in summary_features: - sleep_daily_features = sleep_daily_features.join(features_sum[["minutes_in_bed"]], how="outer").rename(columns={"minutes_in_bed": "sleep_daily_sumdurationinbed" + sleep_type}) - - features_avg = sleep_summary_data[["efficiency", "local_end_date"]].groupby(["local_end_date"]).mean() - features_avg.index.rename("local_date", inplace=True) - if "avgefficiency" in summary_features: - sleep_daily_features = sleep_daily_features.join(features_avg[["efficiency"]], how="outer").rename(columns={"efficiency": "sleep_daily_avgefficiency" + sleep_type}) - - features_count = sleep_summary_data[["local_start_date_time", "local_end_date"]].groupby(["local_end_date"]).count() - features_count.index.rename("local_date", inplace=True) - if "countepisode" in summary_features: - sleep_daily_features = sleep_daily_features.join(features_count[["local_start_date_time"]], how="outer").rename(columns={"local_start_date_time": "sleep_daily_countepisode" + sleep_type}) - - return sleep_daily_features - -def base_fitbit_sleep_features(sleep_summary_data, day_segment, requested_summary_features, requested_sleep_type): - if not day_segment == "daily": - return pd.DataFrame(columns=["local_date"]) - else: - # name of the features this function can compute - base_summary_features_names = ["sumdurationafterwakeup", "sumdurationasleep", "sumdurationawake", "sumdurationtofallasleep", "sumdurationinbed", "avgefficiency", "countepisode"] - base_sleep_type = ["main", "nap", "all"] - # the subset of requested features this function can compute - summary_features_to_compute = list(set(requested_summary_features) & set(base_summary_features_names)) - sleep_type_to_compute = list(set(requested_sleep_type) & set(base_sleep_type)) - # full names - features_fullnames_to_compute = ["".join(feature) for feature in itertools.product(summary_features_to_compute, sleep_type_to_compute)] - - colnames_can_be_zero = ["sleep_daily_" + x for x in [col for col in features_fullnames_to_compute if "avgefficiency" not in col]] - - if sleep_summary_data.empty: - sleep_summary_features = pd.DataFrame(columns=["local_date"] + ["sleep_daily_" + x for x in features_fullnames_to_compute]) - else: - - sleep_summary_features = pd.DataFrame() - - for sleep_type in sleep_type_to_compute: - sleep_summary_features = dailyFeaturesFromSummaryData(sleep_summary_features, sleep_summary_data, summary_features_to_compute, sleep_type) - - sleep_summary_features[colnames_can_be_zero] = sleep_summary_features[colnames_can_be_zero].fillna(0) - - sleep_summary_features = sleep_summary_features.reset_index() - - return sleep_summary_features - diff --git a/src/features/fitbit_sleep_features.py b/src/features/fitbit_sleep_features.py deleted file mode 100644 index 314c13d5..00000000 --- a/src/features/fitbit_sleep_features.py +++ /dev/null @@ -1,18 +0,0 @@ -import pandas as pd -from fitbit_sleep.fitbit_sleep_base import base_fitbit_sleep_features -import itertools - -sleep_summary_data = pd.read_csv(snakemake.input["sleep_summary_data"]) -requested_summary_features = snakemake.params["summary_features"] -requested_sleep_type = snakemake.params["sleep_types"] -day_segment = snakemake.params["day_segment"] -sleep_features = pd.DataFrame(columns=["local_date"]) - -sleep_features = sleep_features.merge(base_fitbit_sleep_features(sleep_summary_data, day_segment, requested_summary_features, requested_sleep_type), on="local_date", how="outer") - -requested_features = ["".join(feature) for feature in itertools.product(requested_summary_features, requested_sleep_type)] if day_segment == "daily" else [] - -assert len(requested_features) + 1 == sleep_features.shape[1], "The number of features in the output dataframe (=" + str(sleep_features.shape[1]) + ") does not match the expected value (=" + str(len(requested_features)) + " + 1). Verify your fitbit sleep feature extraction functions" - -sleep_features.to_csv(snakemake.output[0], index=False) - diff --git a/src/features/fitbit_sleep_summary/rapids/main.py b/src/features/fitbit_sleep_summary/rapids/main.py new file mode 100644 index 00000000..f42f750e --- /dev/null +++ b/src/features/fitbit_sleep_summary/rapids/main.py @@ -0,0 +1,91 @@ +import pandas as pd +import itertools + +def extractSleepFeaturesFromSummaryData(sleep_summary_data, summary_features, sleep_type, sleep_summary_features): + if sleep_type == "main": + sleep_summary_data = sleep_summary_data[sleep_summary_data["is_main_sleep"] == 1] + elif sleep_type == "nap": + sleep_summary_data = sleep_summary_data[sleep_summary_data["is_main_sleep"] == 0] + elif sleep_type == "all": + pass + else: + raise ValueError("sleep_type can only be one of ['main', 'nap', 'all'].") + + features_sum = sleep_summary_data[["local_segment", "minutes_after_wakeup", "minutes_asleep", "minutes_awake", "minutes_to_fall_asleep", "minutes_in_bed"]].groupby(["local_segment"]).sum() + + if "summarysumdurationafterwakeup" in summary_features: + sleep_summary_features = sleep_summary_features.join(features_sum[["minutes_after_wakeup"]], how="outer").rename(columns={"minutes_after_wakeup": "sleep_rapids_summarysumdurationafterwakeup" + sleep_type}) + if "summarysumdurationasleep" in summary_features: + sleep_summary_features = sleep_summary_features.join(features_sum[["minutes_asleep"]], how="outer").rename(columns={"minutes_asleep": "sleep_rapids_summarysumdurationasleep" + sleep_type}) + if "summarysumdurationawake" in summary_features: + sleep_summary_features = sleep_summary_features.join(features_sum[["minutes_awake"]], how="outer").rename(columns={"minutes_awake": "sleep_rapids_summarysumdurationawake" + sleep_type}) + if "summarysumdurationtofallasleep" in summary_features: + sleep_summary_features = sleep_summary_features.join(features_sum[["minutes_to_fall_asleep"]], how="outer").rename(columns={"minutes_to_fall_asleep": "sleep_rapids_summarysumdurationtofallasleep" + sleep_type}) + if "summarysumdurationinbed" in summary_features: + sleep_summary_features = sleep_summary_features.join(features_sum[["minutes_in_bed"]], how="outer").rename(columns={"minutes_in_bed": "sleep_rapids_summarysumdurationinbed" + sleep_type}) + + features_avg = sleep_summary_data[["local_segment", "efficiency", "minutes_after_wakeup", "minutes_asleep", "minutes_awake", "minutes_to_fall_asleep", "minutes_in_bed"]].groupby(["local_segment"]).mean() + + if "summaryavgefficiency" in summary_features: + sleep_summary_features = sleep_summary_features.join(features_avg[["efficiency"]], how="outer").rename(columns={"efficiency": "sleep_rapids_summaryavgefficiency" + sleep_type}) + if "summaryavgdurationafterwakeup" in summary_features: + sleep_summary_features = sleep_summary_features.join(features_avg[["minutes_after_wakeup"]], how="outer").rename(columns={"minutes_after_wakeup": "sleep_rapids_summaryavgdurationafterwakeup" + sleep_type}) + if "summaryavgdurationasleep" in summary_features: + sleep_summary_features = sleep_summary_features.join(features_avg[["minutes_asleep"]], how="outer").rename(columns={"minutes_asleep": "sleep_rapids_summaryavgdurationasleep" + sleep_type}) + if "summaryavgdurationawake" in summary_features: + sleep_summary_features = sleep_summary_features.join(features_avg[["minutes_awake"]], how="outer").rename(columns={"minutes_awake": "sleep_rapids_summaryavgdurationawake" + sleep_type}) + if "summaryavgdurationtofallasleep" in summary_features: + sleep_summary_features = sleep_summary_features.join(features_avg[["minutes_to_fall_asleep"]], how="outer").rename(columns={"minutes_to_fall_asleep": "sleep_rapids_summaryavgdurationtofallasleep" + sleep_type}) + if "summaryavgdurationinbed" in summary_features: + sleep_summary_features = sleep_summary_features.join(features_avg[["minutes_in_bed"]], how="outer").rename(columns={"minutes_in_bed": "sleep_rapids_summaryavgdurationinbed" + sleep_type}) + + features_count = sleep_summary_data[["local_segment", "timestamp"]].groupby(["local_segment"]).count() + + if "summarycountepisode" in summary_features: + sleep_summary_features = sleep_summary_features.join(features_count[["timestamp"]], how="outer").rename(columns={"timestamp": "sleep_rapids_summarycountepisode" + sleep_type}) + + return sleep_summary_features + + +def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs): + + sleep_summary_data = pd.read_csv(sensor_data_files["sensor_data"]) + + requested_summary_features = ["summary" + x for x in provider["FEATURES"]] + requested_sleep_types = provider["SLEEP_TYPES"] + + # name of the features this function can compute + base_summary_features = ["summarycountepisode", "summaryavgefficiency", "summarysumdurationafterwakeup", "summarysumdurationasleep", "summarysumdurationawake", "summarysumdurationtofallasleep", "summarysumdurationinbed", "summaryavgdurationafterwakeup", "summaryavgdurationasleep", "summaryavgdurationawake", "summaryavgdurationtofallasleep", "summaryavgdurationinbed"] + base_sleep_types = ["main", "nap", "all"] + # the subset of requested features this function can compute + summary_features_to_compute = list(set(requested_summary_features) & set(base_summary_features)) + sleep_types_to_compute = list(set(requested_sleep_types) & set(base_sleep_types)) + # full names + features_fullnames_to_compute = ["".join(feature) for feature in itertools.product(summary_features_to_compute, sleep_types_to_compute)] + + colnames_can_be_zero = ["sleep_rapids_" + x for x in [col for col in features_fullnames_to_compute if "summaryavgefficiency" not in col]] + + # extract features from summary data + sleep_summary_features = pd.DataFrame(columns=["local_segment"] + ["sleep_rapids_" + x for x in features_fullnames_to_compute]) + if not sleep_summary_data.empty: + sleep_summary_data = filter_data_by_segment(sleep_summary_data, day_segment) + + if not sleep_summary_data.empty: + # only keep the segments start at 00:00:00 and end at 23:59:59 + datetime_start_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 00:00:00" + datetime_end_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 23:59:59" + + segment_regex = "{}#{},{}".format(day_segment, datetime_start_regex, datetime_end_regex) + sleep_summary_data = sleep_summary_data[sleep_summary_data["local_segment"].str.match(segment_regex)] + + if not sleep_summary_data.empty: + sleep_summary_features = pd.DataFrame() + + for sleep_type in sleep_types_to_compute: + sleep_summary_features = extractSleepFeaturesFromSummaryData(sleep_summary_data, summary_features_to_compute, sleep_type, sleep_summary_features) + + sleep_summary_features[colnames_can_be_zero] = sleep_summary_features[colnames_can_be_zero].fillna(0) + + sleep_summary_features = sleep_summary_features.reset_index() + + return sleep_summary_features diff --git a/src/features/fitbit_step_features.py b/src/features/fitbit_step_features.py deleted file mode 100644 index e0ca3523..00000000 --- a/src/features/fitbit_step_features.py +++ /dev/null @@ -1,66 +0,0 @@ -import pandas as pd -import numpy as np -import time -from fitbit_step.fitbit_step_base import base_fitbit_step_features - -def isInvalidTime(str_time): - try: - time.strptime(str_time, '%H:%M') - return False - except ValueError: - return True - -def isInMainSleep(local_date_time, sleep): - # sleep_period_container = sleep.query("local_start_date_time <= @local_date_time <= local_end_date_time") - sleep_period_container = sleep[(sleep["local_start_date_time"] <= local_date_time) & (local_date_time <= sleep["local_end_date_time"])] - if sleep_period_container.shape[0] >= 1: - return True - else: - return False - -def getStepsOutsideFitbitMainSleep(sleep, steps): - steps['inMainSleep'] = steps.apply(lambda row : isInMainSleep(row['local_date_time'], sleep), axis = 1) - return steps[steps['inMainSleep'] == False] - - -def getStepsOutsideFixedMainSleep(sleepStart, sleepEnd, steps): - steps = steps.set_index('local_date_time') - steps['inMainSleep'] = False - steps.loc[steps.between_time(sleepStart, sleepEnd).index, 'inMainSleep'] = True - steps.reset_index(level=0, inplace=True) - return steps[steps['inMainSleep'] == False] - -step_data = pd.read_csv(snakemake.input["step_data"], parse_dates=["local_date_time", "local_date"]) -day_segment = snakemake.params["day_segment"] -threshold_active_bout = snakemake.params["threshold_active_bout"] -include_zero_step_rows = snakemake.params["include_zero_step_rows"] -exclude_sleep = snakemake.params["exclude_sleep"] -exclude_sleep_type = snakemake.params["exclude_sleep_type"] -exclude_sleep_fixed_start = snakemake.params["exclude_sleep_fixed_start"] -exclude_sleep_fixed_end = snakemake.params["exclude_sleep_fixed_end"] - -step_features = pd.DataFrame(columns=["local_date"]) -requested_features = {} -requested_features["features_all_steps"] = snakemake.params["features_all_steps"] -requested_features["features_sedentary_bout"] = [feature + "sedentarybout" for feature in snakemake.params["features_sedentary_bout"]] -requested_features["features_active_bout"] = [feature + "activebout" for feature in snakemake.params["features_active_bout"]] - -if exclude_sleep == True: - if exclude_sleep_type == "FIXED": - if isInvalidTime(exclude_sleep_fixed_start): - raise ValueError("Your fixed start time has an invalid format in your config.yml file") - if isInvalidTime(exclude_sleep_fixed_end): - raise ValueError("Your fixed end time has an invalid format in your config.yml file") - step_data = getStepsOutsideFixedMainSleep(exclude_sleep_fixed_start, exclude_sleep_fixed_end, step_data) - elif exclude_sleep_type == "FITBIT_BASED": - sleep_data = pd.read_csv(snakemake.input["sleep_data"], parse_dates=["local_start_date_time", "local_end_date_time"]) - step_data = getStepsOutsideFitbitMainSleep(sleep_data, step_data) - else: - raise ValueError("We only support FIXED or FITBIT_BASED to filter step data based on sleep data. You typed " + exclude_sleep_type + ", Check your config.yaml file for typos") - -step_features = step_features.merge(base_fitbit_step_features(step_data, day_segment, requested_features, threshold_active_bout, include_zero_step_rows), on="local_date", how="outer") - - -assert np.sum([len(x) for x in requested_features.values()]) + 1 == step_features.shape[1], "The number of features in the output dataframe (=" + str(step_features.shape[1]) + ") does not match the expected value (=" + str(np.sum([len(x) for x in requested_features.values()])) + " + 1). Verify your fitbit step feature extraction functions" - -step_features.to_csv(snakemake.output[0], index=False) \ No newline at end of file