From 8377c12efb8e307546bf6c87476c0caaf8e2d93e Mon Sep 17 00:00:00 2001 From: Meng Li <34143965+Meng6@users.noreply.github.com> Date: Thu, 18 Feb 2021 20:01:33 -0500 Subject: [PATCH] Add sleep intraday features with RAPIDS provider --- Snakefile | 15 +- config.yaml | 26 ++ docs/features/fitbit-sleep-intraday.md | 41 ++- rules/features.smk | 34 +++ src/data/fitbit_parse_sleep.py | 162 +++++------ .../fitbit_sleep_intraday/rapids/main.py | 265 ++++++++++++++++++ src/features/utils/utils.py | 2 +- 7 files changed, 441 insertions(+), 104 deletions(-) create mode 100644 src/features/fitbit_sleep_intraday/rapids/main.py diff --git a/Snakefile b/Snakefile index 1e4360a5..136773c0 100644 --- a/Snakefile +++ b/Snakefile @@ -257,11 +257,16 @@ for provider in config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"].keys(): files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") -# for provider in config["FITBIT_SLEEP_INTRADAY"]["PROVIDERS"].keys(): -# if config["FITBIT_SLEEP_INTRADAY"]["PROVIDERS"][provider]["COMPUTE"]: -# files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_intraday_raw.csv", pid=config["PIDS"])) -# files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_intraday_parsed.csv", pid=config["PIDS"])) -# files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_intraday_parsed_with_datetime.csv", pid=config["PIDS"])) +for provider in config["FITBIT_SLEEP_INTRADAY"]["PROVIDERS"].keys(): + if config["FITBIT_SLEEP_INTRADAY"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_intraday_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_intraday_parsed.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/fitbit_sleep_intraday_episodes_resampled.csv", pid=config["PIDS"])) + 
files_to_compute.extend(expand("data/interim/{pid}/fitbit_sleep_intraday_episodes_resampled_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/fitbit_sleep_intraday_features/fitbit_sleep_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_SLEEP_INTRADAY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_sleep_intraday.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") for provider in config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"].keys(): if config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"][provider]["COMPUTE"]: diff --git a/config.yaml b/config.yaml index 43734157..a1717419 100644 --- a/config.yaml +++ b/config.yaml @@ -379,6 +379,32 @@ FITBIT_SLEEP_SUMMARY: SRC_FOLDER: "rapids" # inside src/features/fitbit_sleep_summary SRC_LANGUAGE: "python" +# See https://www.rapids.science/latest/features/fitbit-sleep-intraday/ +FITBIT_SLEEP_INTRADAY: + TABLE: fitbit_data + INCLUDE_SLEEP_LATER_THAN: &include_sleep_later_than + 0 # a number ranged from 0 (midnight) to 1439 (23:59) + REFERENCE_TIME: &reference_time + MIDNIGHT # chosen from "MIDNIGHT" and "START_OF_THE_SEGMENT" + PROVIDERS: + RAPIDS: + COMPUTE: False + FEATURES: + LEVELS_AND_TYPES_COMBINING_ALL: True + LEVELS_AND_TYPES: [countepisode, sumduration, maxduration, minduration, avgduration, medianduration, stdduration] + RATIOS_TYPE: [count, duration] + RATIOS_SCOPE: [ACROSS_LEVELS, ACROSS_TYPES, WITHIN_LEVELS, WITHIN_TYPES] + ROUTINE: [starttimefirstmainsleep, endtimelastmainsleep, starttimefirstnap, endtimelastnap] + SLEEP_LEVELS: + CLASSIC: [awake, restless, asleep] + STAGES: [wake, deep, light, rem] + UNIFIED: [awake, asleep] + SLEEP_TYPES: [main, nap] + 
INCLUDE_SLEEP_LATER_THAN: *include_sleep_later_than + REFERENCE_TIME: *reference_time + SRC_FOLDER: "rapids" # inside src/features/fitbit_sleep_intraday + SRC_LANGUAGE: "python" + # See https://www.rapids.science/latest/features/fitbit-steps-summary/ FITBIT_STEPS_SUMMARY: TABLE: steps_summary diff --git a/docs/features/fitbit-sleep-intraday.md b/docs/features/fitbit-sleep-intraday.md index 7425f6b8..14093066 100644 --- a/docs/features/fitbit-sleep-intraday.md +++ b/docs/features/fitbit-sleep-intraday.md @@ -5,7 +5,7 @@ Sensor parameters description for `[FITBIT_SLEEP_INTRADAY]`: |Key                              | Description | |----------------|----------------------------------------------------------------------------------------------------------------------------------- |`[TABLE]`| Database table name or file path where the sleep intraday data is stored. The configuration keys in [Device Data Source Configuration](../../setup/configuration/#device-data-source-configuration) control whether this parameter is interpreted as table or file. -|`[INCLUDE_EPISODES_LATER_THAN]`| All sleep episodes that started after this time will be included in the feature computation. It is a number ranging from 0 (midnight) to 1439 (23:59) which denotes the number of minutes after midnight. If a segment is longer than one day, this value is for every day. +|`[INCLUDE_SLEEP_LATER_THAN]`| All resampled sleep rows (bin interval: one minute) that started after this time will be included in the feature computation. It is a number ranging from 0 (midnight) to 1439 (23:59) which denotes the number of minutes after midnight. If a segment is longer than one day, this value is for every day. |`[REFERENCE_TIME]`| The reference point from which the `[ROUTINE]` features are to be computed. Chosen from `MIDNIGHT` and `START_OF_THE_SEGMENT`, default is `MIDNIGHT`. If you have multiple time segments per day it might be more informative to set this flag to `START_OF_THE_SEGMENT`. 
The format of the column(s) containing the Fitbit sensor data can be `JSON` or `PLAIN_TEXT`. The data in `JSON` format is obtained directly from the Fitbit API. We support `PLAIN_TEXT` in case you already parsed your data and don't have access to your participants' Fitbit accounts anymore. If your data is in `JSON` format then summary and intraday data come packed together. @@ -23,7 +23,14 @@ We provide examples of the input format that RAPIDS expects, note that both exam |a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"sleep": [{"awakeCount": 1, "awakeDuration": 3, "awakeningsCount": 8, "dateOfSleep": "2020-10-09", "duration": 19320000, "efficiency": 96, "endTime": "2020-10-09T05:57:30.000", "isMainSleep": true, "logId": 14161136803, "minuteData": [{"dateTime": "00:35:30", "value": "2"}, {"dateTime": "00:36:30", "value": "1"}, {"dateTime": "00:37:30", "value": "1"},...], "minutesAfterWakeup": 0, "minutesAsleep": 309, "minutesAwake": 13, "minutesToFallAsleep": 0, "restlessCount": 7, "restlessDuration": 10, "startTime": "2020-10-09T00:35:30.000", "timeInBed": 322}], "summary": {"totalMinutesAsleep": 309, "totalSleepRecords": 1, "totalTimeInBed": 322}} === "PLAIN_TEXT" - Will update this section later. + + All columns are mandatory, however, all except `local_date_time` and `duration` can be empty if you don't have that data. Just have in mind that some features might be inaccurate or empty as `type_episode_id`, `level`, `is_main_sleep`, and `type` are used for sleep episodes extraction. `type_episode_id` is based on where it is extracted: if it is extracted from the 1st "minuteData" block, the `type_episode_id` field will be 0. Similarly, the kth block will be k-1.
+ + |type_episode_id |local_date_time |duration |level |is_main_sleep |type | + |---------------- |------------------- |--------- |---------- |-------------- |-------------- | + |0 |2020-10-07 15:55:00 |60 |awake |0 |classic | + |0 |2020-10-07 15:56:00 |60 |awake |0 |classic | + |0 |2020-10-07 15:57:00 |60 |restless |0 |classic | ??? example "Example of the structure of source data with Fitbit’s sleep API Version 1.2" @@ -36,7 +43,14 @@ We provide examples of the input format that RAPIDS expects, note that both exam |a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"sleep":[{"dateOfSleep":"2020-10-12","duration":28980000,"efficiency":93,"endTime":"2020-10-12T09:34:30.000","infoCode":0,"isMainSleep":true,"levels":{"data":[{"dateTime":"2020-10-12T01:31:00.000","level":"wake","seconds":600},{"dateTime":"2020-10-12T01:41:00.000","level":"light","seconds":60},{"dateTime":"2020-10-12T01:42:00.000","level":"deep","seconds":2340},...], "summary":{"deep":{"count":4,"minutes":63,"thirtyDayAvgMinutes":59},"light":{"count":27,"minutes":257,"thirtyDayAvgMinutes":364},"rem":{"count":5,"minutes":94,"thirtyDayAvgMinutes":58},"wake":{"count":24,"minutes":69,"thirtyDayAvgMinutes":95}}},"logId":26589710673,"minutesAfterWakeup":0,"minutesAsleep":415,"minutesAwake":68,"minutesToFallAsleep":0,"startTime":"2020-10-12T01:31:00.000","timeInBed":483,"type":"stages"}],"summary":{"stages":{"deep":63,"light":257,"rem":94,"wake":69},"totalMinutesAsleep":415,"totalSleepRecords":1,"totalTimeInBed":483}} === "PLAIN_TEXT" - Will update this section later. + + All columns are mandatory, however, all except `local_date_time` and `duration` can be empty if you don't have that data. Just have in mind that some features might be inaccurate or empty as `type_episode_id`, `level`, `is_main_sleep`, and `type` are used for sleep episodes extraction. `type_episode_id` is based on where it is extracted: if it is extracted from the 1st "data" and "shortData" block, the `type_episode_id` field will be 0. 
Similarly, the kth block will be k-1. + + |type_episode_id |local_date_time |duration |level |is_main_sleep |type | + |---------------- |------------------- |--------- |---------- |-------------- |-------------- | + |0 |2020-10-10 15:36:30 |60 |restless |1 |stages | + |0 |2020-10-10 15:37:30 |660 |asleep |1 |stages | + |0 |2020-10-10 15:48:30 |60 |restless |1 |stages | ## RAPIDS provider @@ -45,10 +59,10 @@ We provide examples of the input format that RAPIDS expects, note that both exam !!! info "File Sequence" ```bash - # [might update this section later] - data/raw/{pid}/fitbit_sleep_intraday_raw.csv - data/raw/{pid}/fitbit_sleep_intraday_parsed.csv - - data/raw/{pid}/fitbit_sleep_intraday_parsed_with_datetime.csv + - data/interim/{pid}/fitbit_sleep_intraday_episodes_resampled.csv + - data/interim/{pid}/fitbit_sleep_intraday_episodes_resampled_with_datetime.csv - data/interim/{pid}/fitbit_sleep_intraday_features/fitbit_sleep_intraday_{language}_{provider_key}.csv - data/processed/features/{pid}/fitbit_sleep_intraday.csv ``` @@ -67,13 +81,14 @@ Parameters description for `[FITBIT_SLEEP_INTRADAY][PROVIDERS][RAPIDS]`: Features description for `[FITBIT_STEPS_INTRADAY][PROVIDERS][RAPIDS][LEVELS_AND_TYPES]`: |Feature                                           |Units |Description | -|-------------------------- |-------------- |-------------------------------------------------------------| -|countepisode`[LEVEL][TYPE]` |episodes |Number of `[LEVEL][TYPE]`sleep episodes. `[LEVEL]`is one of `[SLEEP_LEVELS]` (e.g. awake-classic or rem-stages) and `[TYPE]` is one of `[SLEEP_TYPES]` (e.g. main). Both `[LEVEL]`and `[TYPE]`can also be `all` when ``LEVELS_AND_TYPES_COMBINING_ALL`` is True, which groups all `CLASSIC`, `STAGE`, `UNIFIED` levels and both sleep types. -|sumduration`[LEVEL][TYPE]` |minutes |Total duration of all `[LEVEL][TYPE]`sleep episodes. `[LEVEL]`is one of `[SLEEP_LEVELS]` (e.g. awake-classic or rem-stages) and `[TYPE]` is one of `[SLEEP_TYPES]` (e.g. main). 
Both `[LEVEL]`and `[TYPE]`can also be `all` when `LEVELS_AND_TYPES_COMBINING_ALL` is True, which groups all `CLASSIC`, `STAGE`, `UNIFIED` levels and both sleep types. -|maxduration`[LEVEL][TYPE]` |minutes | Longest duration of any `[LEVEL][TYPE]`sleep episode. `[LEVEL]`is one of `[SLEEP_LEVELS]` (e.g. awake-classic or rem-stages) and `[TYPE]` is one of `[SLEEP_TYPES]` (e.g. main). Both `[LEVEL]`and `[TYPE]`can also be `all` when `LEVELS_AND_TYPES_COMBINING_ALL` is True, which groups all `CLASSIC`, `STAGE`, `UNIFIED` levels and both sleep types. -|minduration`[LEVEL][TYPE]` |minutes | Shortest duration of any `[LEVEL][TYPE]`sleep episode. `[LEVEL]`is one of `[SLEEP_LEVELS]` (e.g. awake-classic or rem-stages) and `[TYPE]` is one of `[SLEEP_TYPES]` (e.g. main). Both `[LEVEL]`and `[TYPE]`can also be `all` when `LEVELS_AND_TYPES_COMBINING_ALL` is True, which groups all `CLASSIC`, `STAGE`, `UNIFIED` levels and both sleep types. -|avgduration`[LEVEL][TYPE]` |minutes | Average duration of all `[LEVEL][TYPE]`sleep episodes. `[LEVEL]`is one of `[SLEEP_LEVELS]` (e.g. awake-classic or rem-stages) and `[TYPE]` is one of `[SLEEP_TYPES]` (e.g. main). Both `[LEVEL]`and `[TYPE]`can also be `all` when `LEVELS_AND_TYPES_COMBINING_ALL` is True, which groups all `CLASSIC`, `STAGE`, `UNIFIED` levels and both sleep types. -|stdduration`[LEVEL][TYPE]` |minutes | Standard deviation duration of all `[LEVEL][TYPE]`sleep episodes. `[LEVEL]`is one of `[SLEEP_LEVELS]` (e.g. awake-classic or rem-stages) and `[TYPE]` is one of `[SLEEP_TYPES]` (e.g. main). Both `[LEVEL]`and `[TYPE]`can also be `all` when `LEVELS_AND_TYPES_COMBINING_ALL` is True, which groups all `CLASSIC`, `STAGE`, `UNIFIED` levels and both sleep types. +|------------------------------- |-------------- |-------------------------------------------------------------| +|countepisode`[LEVEL][TYPE]` |episodes |Number of `[LEVEL][TYPE]`sleep episodes. `[LEVEL]`is one of `[SLEEP_LEVELS]` (e.g. 
awake-classic or rem-stages) and `[TYPE]` is one of `[SLEEP_TYPES]` (e.g. main). Both `[LEVEL]`and `[TYPE]` can also be `all` when ``LEVELS_AND_TYPES_COMBINING_ALL`` is True, which ignores the levels and groups by sleep types. +|sumduration`[LEVEL][TYPE]` |minutes |Total duration of all `[LEVEL][TYPE]`sleep episodes. `[LEVEL]`is one of `[SLEEP_LEVELS]` (e.g. awake-classic or rem-stages) and `[TYPE]` is one of `[SLEEP_TYPES]` (e.g. main). Both `[LEVEL]` and `[TYPE]`can also be `all` when `LEVELS_AND_TYPES_COMBINING_ALL` is True, which ignores the levels and groups by sleep types. +|maxduration`[LEVEL][TYPE]` |minutes | Longest duration of any `[LEVEL][TYPE]`sleep episode. `[LEVEL]`is one of `[SLEEP_LEVELS]` (e.g. awake-classic or rem-stages) and `[TYPE]` is one of `[SLEEP_TYPES]` (e.g. main). Both `[LEVEL]` and `[TYPE]`can also be `all` when `LEVELS_AND_TYPES_COMBINING_ALL` is True, which ignores the levels and groups by sleep types. +|minduration`[LEVEL][TYPE]` |minutes | Shortest duration of any `[LEVEL][TYPE]`sleep episode. `[LEVEL]`is one of `[SLEEP_LEVELS]` (e.g. awake-classic or rem-stages) and `[TYPE]` is one of `[SLEEP_TYPES]` (e.g. main). Both `[LEVEL]` and `[TYPE]`can also be `all` when `LEVELS_AND_TYPES_COMBINING_ALL` is True, which ignores the levels and groups by sleep types. +|avgduration`[LEVEL][TYPE]` |minutes | Average duration of all `[LEVEL][TYPE]`sleep episodes. `[LEVEL]`is one of `[SLEEP_LEVELS]` (e.g. awake-classic or rem-stages) and `[TYPE]` is one of `[SLEEP_TYPES]` (e.g. main). Both `[LEVEL]` and `[TYPE]`can also be `all` when `LEVELS_AND_TYPES_COMBINING_ALL` is True, which ignores the levels and groups by sleep types. +|medianduration`[LEVEL][TYPE]` |minutes | Median duration of all `[LEVEL][TYPE]`sleep episodes. `[LEVEL]`is one of `[SLEEP_LEVELS]` (e.g. awake-classic or rem-stages) and `[TYPE]` is one of `[SLEEP_TYPES]` (e.g. main). 
Both `[LEVEL]` and `[TYPE]`can also be `all` when `LEVELS_AND_TYPES_COMBINING_ALL` is True, which ignores the levels and groups by sleep types. +|stdduration`[LEVEL][TYPE]` |minutes | Standard deviation duration of all `[LEVEL][TYPE]`sleep episodes. `[LEVEL]`is one of `[SLEEP_LEVELS]` (e.g. awake-classic or rem-stages) and `[TYPE]` is one of `[SLEEP_TYPES]` (e.g. main). Both `[LEVEL]` and `[TYPE]`can also be `all` when `LEVELS_AND_TYPES_COMBINING_ALL` is True, which ignores the levels and groups by sleep types. Features description for `[FITBIT_STEPS_INTRADAY][PROVIDERS][RAPIDS]` RATIOS `[ACROSS_LEVELS]`: @@ -122,7 +137,7 @@ Features description for `[FITBIT_STEPS_INTRADAY][PROVIDERS][RAPIDS][ROUTINE]`: !!! note "Assumptions/Observations" - 1. Deleting values from `[SLEEP_LEVELS]` or `[SLEEP_TYPES]` will only change the features you receive from `[LEVELS_AND_TYPES]`. For example if `STAGES` only contains `[rem, light]` you will not receive `countepisode[wake|deep][TYPE]` or sum, max, min, avg, or std `duration`. These values will not influence `RATIOS` or `ROUTINE` features. + 1. Deleting values from `[SLEEP_LEVELS]` or `[SLEEP_TYPES]` will only change the features you receive from `[LEVELS_AND_TYPES]`. For example if `STAGES` only contains `[rem, light]` you will not receive `countepisode[wake|deep][TYPE]` or sum, max, min, avg, median, or std `duration`. These values will not influence `RATIOS` or `ROUTINE` features. 2. Any `[LEVEL]` grouping is done within the elements of each class `CLASSIC`, `STAGES`, and `UNIFIED`. That is, we never combine `CLASSIC` or `STAGES` types to compute features when `LEVELS_AND_TYPES_COMBINING_ALL` is True or when computing `RATIOS`. 
diff --git a/rules/features.smk b/rules/features.smk index 7c8edf9c..77190b04 100644 --- a/rules/features.smk +++ b/rules/features.smk @@ -660,6 +660,40 @@ rule fitbit_sleep_summary_r_features: script: "../src/features/entry.R" +rule resample_sleep_episodes: + input: + "data/raw/{pid}/fitbit_sleep_intraday_parsed.csv" + output: + "data/interim/{pid}/fitbit_sleep_intraday_episodes_resampled.csv" + script: + "../src/features/utils/resample_episodes.R" + +rule fitbit_sleep_intraday_python_features: + input: + sensor_data = "data/interim/{pid}/fitbit_sleep_intraday_episodes_resampled_with_datetime.csv", + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" + params: + provider = lambda wildcards: config["FITBIT_SLEEP_INTRADAY"]["PROVIDERS"][wildcards.provider_key.upper()], + provider_key = "{provider_key}", + sensor_key = "fitbit_sleep_intraday" + output: + "data/interim/{pid}/fitbit_sleep_intraday_features/fitbit_sleep_intraday_python_{provider_key}.csv" + script: + "../src/features/entry.py" + +rule fitbit_sleep_intraday_r_features: + input: + sensor_data = "data/interim/{pid}/fitbit_sleep_intraday_episodes_resampled_with_datetime.csv", + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" + params: + provider = lambda wildcards: config["FITBIT_SLEEP_INTRADAY"]["PROVIDERS"][wildcards.provider_key.upper()], + provider_key = "{provider_key}", + sensor_key = "fitbit_sleep_intraday" + output: + "data/interim/{pid}/fitbit_sleep_intraday_features/fitbit_sleep_intraday_r_{provider_key}.csv" + script: + "../src/features/entry.R" + rule merge_sensor_features_for_individual_participants: input: feature_files = input_merge_sensor_features_for_individual_participants diff --git a/src/data/fitbit_parse_sleep.py b/src/data/fitbit_parse_sleep.py index 1995fb01..bf573437 100644 --- a/src/data/fitbit_parse_sleep.py +++ b/src/data/fitbit_parse_sleep.py @@ -14,66 +14,53 @@ SLEEP_SUMMARY_COLUMNS_V1_2 = ("device_id", 
"efficiency", "timestamp") SLEEP_SUMMARY_COLUMNS_V1 = SLEEP_SUMMARY_COLUMNS_V1_2 + ("count_awake", "duration_awake", "count_awakenings", "count_restless", "duration_restless") -SLEEP_INTRADAY_COLUMNS = ("device_id", +SLEEP_INTRADAY_COLUMNS = (# Extract "type_episode_id" field based on summary data: start from 0 + "type_episode_id", + "duration", # For "classic" type, original_level is one of {"awake", "restless", "asleep"} # For "stages" type, original_level is one of {"wake", "deep", "light", "rem"} "level", # For "classic" type, unified_level is one of {0, 1} where 0: awake {"awake" + "restless"}, 1: asleep {"asleep"} # For "stages" type, unified_level is one of {0, 1} where 0: awake {"wake"}, 1: asleep {"deep" + "light" + "rem"} "unified_level", - # one of {0, 1} where 0: nap, 1: main sleep + # One of {0, 1} where 0: nap, 1: main sleep "is_main_sleep", - # one of {"classic", "stages"} + # One of {"classic", "stages"} "type", "local_date_time", - "timestamp") + "start_timestamp", + "end_timestamp") -def mergeLongAndShortData(data_summary): - longData = pd.DataFrame(columns=['dateTime', 'level', 'seconds']) - shortData = pd.DataFrame(columns=['dateTime','level', 'seconds']) - windowLength = 30 +def mergeLongAndShortData(data_intraday): + long_data = pd.DataFrame(columns=["dateTime", "level"]) + short_data = pd.DataFrame(columns=["dateTime", "level"]) - for data in data_summary['data']: - origEntry = data + window_length = 30 + + for data in data_intraday["data"]: counter = 0 - numberOfSplits = origEntry['seconds']//windowLength - for times in range(numberOfSplits): - newRow = {'dateTime':dateutil.parser.parse(origEntry['dateTime'])+timedelta(seconds=counter*windowLength),'level':origEntry['level'],'seconds':windowLength} - longData = longData.append(newRow, ignore_index = True) + for times in range(data["seconds"] // window_length): + row = {"dateTime": dateutil.parser.parse(data["dateTime"])+timedelta(seconds=counter*window_length), "level": data["level"]} + 
long_data = long_data.append(row, ignore_index = True) counter = counter + 1 - for data in data_summary['shortData']: - origEntry = data + for data in data_intraday["shortData"]: counter = 0 - numberOfSplits = origEntry['seconds']//windowLength - for times in range(numberOfSplits): - newRow = {'dateTime':dateutil.parser.parse(origEntry['dateTime'])+timedelta(seconds=counter*windowLength),'level':origEntry['level'],'seconds':windowLength} - shortData = shortData.append(newRow,ignore_index = True) + for times in range(data["seconds"] // window_length): + row = {"dateTime": dateutil.parser.parse(data["dateTime"])+timedelta(seconds=counter*window_length), "level": data["level"]} + short_data = short_data.append(row, ignore_index = True) counter = counter + 1 - longData.set_index('dateTime',inplace=True) - shortData.set_index('dateTime',inplace=True) - longData['level'] = np.where(longData.index.isin(shortData.index) == True,'wake',longData['level']) + long_data.set_index("dateTime",inplace=True) + short_data.set_index("dateTime",inplace=True) + long_data["level"] = np.where(long_data.index.isin(short_data.index) == True, "wake", long_data["level"]) - longData.reset_index(inplace=True) + long_data.reset_index(inplace=True) - return longData.values.tolist() - -def classicData1min(data_summary): - dataList = list() - for data in data_summary['data']: - origEntry = data - counter = 0 - timeDuration = 60 - numberOfSplits = origEntry['seconds']//timeDuration - for times in range(numberOfSplits): - newRow = {'dateTime':dateutil.parser.parse(origEntry['dateTime'])+timedelta(seconds=counter*timeDuration),'level':origEntry['level'],'seconds':timeDuration} - dataList.append(newRow) - counter = counter + 1 - return dataList + return long_data.values.tolist() # Parse one record for sleep API version 1 -def parseOneRecordForV1(record, device_id, d_is_main_sleep, records_summary, records_intraday, fitbit_data_type): +def parseOneRecordForV1(record, device_id, type_episode_id, 
d_is_main_sleep, records_summary, records_intraday, fitbit_data_type): sleep_record_type = "classic" @@ -110,16 +97,16 @@ def parseOneRecordForV1(record, device_id, d_is_main_sleep, records_summary, rec d_original_level = SLEEP_CODE2LEVEL[int(data["value"])-1] - row_intraday = (device_id, + row_intraday = (type_episode_id, 60, d_original_level, -1, d_is_main_sleep, sleep_record_type, - d_datetime, 0) + d_datetime, 0, 0) records_intraday.append(row_intraday) return records_summary, records_intraday # Parse one record for sleep API version 1.2 -def parseOneRecordForV12(record, device_id, d_is_main_sleep, records_summary, records_intraday, fitbit_data_type): +def parseOneRecordForV12(record, device_id, type_episode_id, d_is_main_sleep, records_summary, records_intraday, fitbit_data_type): sleep_record_type = record['type'] @@ -138,52 +125,24 @@ def parseOneRecordForV12(record, device_id, d_is_main_sleep, records_summary, re # Intraday data if fitbit_data_type == "intraday": - if sleep_record_type == 'classic': - start_date = d_start_datetime.date() - end_date = d_end_datetime.date() - is_before_midnight = True - curr_date = start_date - data_summary = record['levels'] - dataSplitted = classicData1min(data_summary) ##Calling the function to split the data in regular 60 seconds interval - for data in dataSplitted: - # For overnight episodes, use end_date once we are over midnight - d_time = data["dateTime"].time() - if is_before_midnight and d_time.hour == 0: - curr_date = end_date - d_datetime = datetime.combine(curr_date, d_time) + if sleep_record_type == "classic": + for data in record["levels"]["data"]: + d_datetime = dateutil.parser.parse(data["dateTime"]) - d_original_level = data["level"] - - row_intraday = (device_id, - d_original_level, -1, d_is_main_sleep, sleep_record_type, - d_datetime, 0) + row_intraday = (type_episode_id, data["seconds"], + data["level"], -1, d_is_main_sleep, sleep_record_type, + d_datetime, 0, 0) records_intraday.append(row_intraday) 
else: # For sleep type "stages" - start_date = d_start_datetime.date() - end_date = d_end_datetime.date() - is_before_midnight = True - curr_date = start_date - data_summary = record['levels'] - dataList = mergeLongAndShortData(data_summary) - for data in dataList: - - d_time = data[0].time() - if is_before_midnight and d_time.hour == 0: - curr_date = end_date - d_datetime = datetime.combine(curr_date, d_time) - - d_original_level = data[1] - - row_intraday = (device_id, - d_original_level, -1, d_is_main_sleep, sleep_record_type, - d_datetime, 0) + for data in mergeLongAndShortData(record["levels"]): + row_intraday = (type_episode_id, 30, + data[1], -1, d_is_main_sleep, sleep_record_type, + data[0], 0, 0) records_intraday.append(row_intraday) return records_summary, records_intraday - - def parseSleepData(sleep_data, fitbit_data_type): SLEEP_SUMMARY_COLUMNS = SLEEP_SUMMARY_COLUMNS_V1_2 @@ -194,6 +153,7 @@ def parseSleepData(sleep_data, fitbit_data_type): return pd.DataFrame(columns=SLEEP_INTRADAY_COLUMNS) device_id = sleep_data["device_id"].iloc[0] records_summary, records_intraday = [], [] + type_episode_id = 0 # Parse JSON into individual records for multi_record in sleep_data.fitbit_data: for record in json.loads(multi_record)["sleep"]: @@ -203,11 +163,13 @@ def parseSleepData(sleep_data, fitbit_data_type): # For sleep API version 1 if "awakeCount" in record: SLEEP_SUMMARY_COLUMNS = SLEEP_SUMMARY_COLUMNS_V1 - records_summary, records_intraday = parseOneRecordForV1(record, device_id, d_is_main_sleep, records_summary, records_intraday, fitbit_data_type) + records_summary, records_intraday = parseOneRecordForV1(record, device_id, type_episode_id, d_is_main_sleep, records_summary, records_intraday, fitbit_data_type) # For sleep API version 1.2 else: SLEEP_SUMMARY_COLUMNS = SLEEP_SUMMARY_COLUMNS_V1_2 - records_summary, records_intraday = parseOneRecordForV12(record, device_id, d_is_main_sleep, records_summary, records_intraday, fitbit_data_type) + records_summary, 
records_intraday = parseOneRecordForV12(record, device_id, type_episode_id, d_is_main_sleep, records_summary, records_intraday, fitbit_data_type) + + type_episode_id = type_episode_id + 1 if fitbit_data_type == "summary": parsed_data = pd.DataFrame(data=records_summary, columns=SLEEP_SUMMARY_COLUMNS) @@ -216,6 +178,19 @@ def parseSleepData(sleep_data, fitbit_data_type): return parsed_data +def mergeSleepEpisodes(sleep_data, cols_for_groupby): + sleep_episodes = pd.DataFrame(columns=["type_episode_id", "level_episode_id", "level", "unified_level", "is_main_sleep", "type", "start_timestamp", "end_timestamp"]) + if not sleep_data.empty: + sleep_data = sleep_data.groupby(by=cols_for_groupby) + sleep_episodes = sleep_data[["start_timestamp"]].first() + sleep_episodes["end_timestamp"] = sleep_data["end_timestamp"].last() + + sleep_episodes.reset_index(inplace=True, drop=False) + + return sleep_episodes + + + timezone = snakemake.params["timezone"] column_format = snakemake.params["column_format"] fitbit_data_type = snakemake.params["fitbit_data_type"] @@ -237,6 +212,9 @@ elif column_format == "PLAIN_TEXT": else: raise ValueError("column_format can only be one of ['JSON', 'PLAIN_TEXT'].") +# Drop duplicates +parsed_data.drop_duplicates(inplace=True) + if parsed_data.shape[0] > 0 and fitbit_data_type == "summary": if sleep_episode_timestamp != "start" and sleep_episode_timestamp != "end": raise ValueError("SLEEP_EPISODE_TIMESTAMP can only be one of ['start', 'end'].") @@ -245,6 +223,10 @@ if parsed_data.shape[0] > 0 and fitbit_data_type == "summary": if not pd.isnull(local_start_date) and not pd.isnull(local_end_date): parsed_data = parsed_data.loc[(parsed_data[datetime_column] >= local_start_date) & (parsed_data[datetime_column] < local_end_date)] + + # Sort by "local_start_date_time" column + parsed_data.sort_values(by="local_start_date_time", ascending=True, inplace=True) + parsed_data["timestamp"] = parsed_data[datetime_column].dt.tz_localize(timezone, ambiguous=False, 
nonexistent="NaT").dropna().astype(np.int64) // 10**6 parsed_data.dropna(subset=['timestamp'], inplace=True) parsed_data.drop(["local_start_date_time", "local_end_date_time"], axis = 1, inplace=True) @@ -252,8 +234,18 @@ if parsed_data.shape[0] > 0 and fitbit_data_type == "summary": if parsed_data.shape[0] > 0 and fitbit_data_type == "intraday": if not pd.isnull(local_start_date) and not pd.isnull(local_end_date): parsed_data = parsed_data.loc[(parsed_data["local_date_time"] >= local_start_date) & (parsed_data["local_date_time"] < local_end_date)] - parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6 - parsed_data.dropna(subset=['timestamp'], inplace=True) - parsed_data["unified_level"] = np.where(parsed_data["level"].isin(["awake", "wake", "restless"]), 0, 1) + + # Sort by "local_date_time" column + parsed_data.sort_values(by="local_date_time", ascending=True, inplace=True) + + parsed_data["start_timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6 + parsed_data.dropna(subset=['start_timestamp'], inplace=True) + parsed_data["end_timestamp"] = parsed_data["start_timestamp"] + ((parsed_data["duration"] - 1) * 1000) + 999 + parsed_data["unified_level"] = np.where(parsed_data["level"].isin(["awake", "restless", "wake"]), 0, 1) + + # Put consecutive rows with the same "level" field together and merge episodes + parsed_data.insert(2, "level_episode_id", (parsed_data[["type_episode_id", "level"]] != parsed_data[["type_episode_id", "level"]].shift()).any(axis=1).cumsum()) + parsed_data = mergeSleepEpisodes(parsed_data, ["type_episode_id", "level_episode_id", "level", "unified_level", "is_main_sleep", "type"]) + parsed_data.to_csv(snakemake.output[0], index=False) diff --git a/src/features/fitbit_sleep_intraday/rapids/main.py b/src/features/fitbit_sleep_intraday/rapids/main.py new file 
# Patch context kept from the original diff header:
#   new file src/features/fitbit_sleep_intraday/rapids/main.py
#   mode 100644, index 00000000..f7e95599, hunk @@ -0,0 +1,265 @@
"""Fitbit sleep intraday features computed by the RAPIDS provider."""
import itertools
from datetime import datetime  # not referenced in this module; kept so the import block is unchanged

import pandas as pd

# pandas aggregation applied to the per-segment episode durations for each stats feature.
_DURATION_AGGREGATIONS = {
    "countepisode": "count",
    "sumduration": "sum",
    "maxduration": "max",
    "minduration": "min",
    "avgduration": "mean",
    "medianduration": "median",
    "stdduration": "std",
}

_REFERENCE_TIMES = ("MIDNIGHT", "START_OF_THE_SEGMENT")


def featuresFullNames(intraday_features_to_compute, sleep_levels_to_compute, sleep_types_to_compute, consider_all):
    """Build the ordered list of output column names for the requested features.

    Parameters:
        intraday_features_to_compute: dict that may hold the keys "LEVELS_AND_TYPES",
            "RATIOS_TYPE", "RATIOS_SCOPE" and "ROUTINE"; a missing key counts as an empty list.
        sleep_levels_to_compute: dict mapping a level group (e.g. "CLASSIC") to its levels.
        sleep_types_to_compute: list of sleep types (e.g. ["main", "nap"]).
        consider_all: when truthy, the stats features are also named for the "all" level
            and the "all" type.

    Returns:
        A list starting with "local_segment" followed by the stats, ratios and routine names.
    """
    stats = intraday_features_to_compute.get("LEVELS_AND_TYPES", [])
    ratios_types = intraday_features_to_compute.get("RATIOS_TYPE", [])
    ratios_scopes = intraday_features_to_compute.get("RATIOS_SCOPE", [])
    routine = intraday_features_to_compute.get("ROUTINE", [])

    # A level is always suffixed with its lower-cased group, e.g. "awake" + "classic".
    levels = [level + group.lower() for group in sleep_levels_to_compute for level in sleep_levels_to_compute[group]]
    types = list(sleep_types_to_compute)

    names = ["local_segment"]

    if consider_all:
        stats_levels, stats_types = levels + ["all"], types + ["all"]
    else:
        stats_levels, stats_types = levels, types
    names.extend(feature + level + sleep_type for feature, level, sleep_type in itertools.product(stats, stats_levels, stats_types))

    if "ACROSS_LEVELS" in ratios_scopes:
        names.extend("ratio" + ratios_type + level for ratios_type, level in itertools.product(ratios_types, levels))
    if "ACROSS_TYPES" in ratios_scopes and "main" in types:
        names.extend("ratio" + ratios_type + "main" for ratios_type in ratios_types)
    if "WITHIN_LEVELS" in ratios_scopes:
        names.extend("ratio" + ratios_type + sleep_type + "within" + level for ratios_type, sleep_type, level in itertools.product(ratios_types, types, levels))
    if "WITHIN_TYPES" in ratios_scopes:
        names.extend("ratio" + ratios_type + level + "within" + sleep_type for ratios_type, level, sleep_type in itertools.product(ratios_types, levels, types))

    names.extend(routine)
    return names


def mergeSleepEpisodes(sleep_data, cols_for_groupby):
    """Collapse the rows of each group into a single episode.

    Durations are summed; the start columns take the first value of the group and the end
    columns take the last one. When `cols_for_groupby` is empty or `sleep_data` has no rows,
    an empty frame with the episode columns is returned (after the same `reset_index`).
    """
    if not cols_for_groupby or sleep_data.empty:
        sleep_episodes = pd.DataFrame(columns=["local_segment", "duration", "start_timestamp", "end_timestamp", "local_start_date_time", "local_end_date_time", "start_minutes"])
    else:
        grouped = sleep_data.groupby(by=cols_for_groupby)
        sleep_episodes = grouped[["duration"]].sum()
        sleep_episodes["start_timestamp"] = grouped["start_timestamp"].first()
        sleep_episodes["end_timestamp"] = grouped["end_timestamp"].last()
        sleep_episodes["local_start_date_time"] = grouped["local_start_date_time"].first()
        sleep_episodes["local_end_date_time"] = grouped["local_end_date_time"].last()
        sleep_episodes["start_minutes"] = grouped["start_minutes"].first()

    sleep_episodes.reset_index(inplace=True, drop=False)

    return sleep_episodes


def statsFeatures(sleep_episodes, features, episode_type):
    """Aggregate episode durations per `local_segment`.

    Returns a frame with one column per entry of `features`, each named
    `feature + episode_type`. Columns whose feature is not a known stats feature stay empty.
    When `sleep_episodes` has no rows the frame has the columns but no rows.
    """
    episode_features = pd.DataFrame(columns=[feature + episode_type for feature in features])
    if sleep_episodes.empty:
        return episode_features

    # One groupby is enough: every stats feature aggregates the same duration column.
    durations = sleep_episodes.groupby(["local_segment"])["duration"]
    for feature, aggregation in _DURATION_AGGREGATIONS.items():
        if feature in features:
            episode_features[feature + episode_type] = durations.agg(aggregation)

    return episode_features


def _selectSleepType(sleep_data, sleep_type):
    """Keep main sleep rows for "main", non-main rows for any other type, everything for "all"."""
    if sleep_type == "all":
        return sleep_data
    return sleep_data[sleep_data["is_main_sleep"] == (1 if sleep_type == "main" else 0)]


def allStatsFeatures(sleep_data, base_sleep_levels, base_sleep_types, features, sleep_intraday_features):
    """Compute the stats features for every level/type combination, plus the "all" ones.

    Columns are appended to `sleep_intraday_features` in this order: CLASSIC, STAGES,
    UNIFIED, then the level-agnostic "all" + type columns.
    """
    sleep_types = base_sleep_types + ["all"]

    # CLASSIC and STAGES use the `level` column as is, one row per episode.
    # NOTE(review): for the "all" level no level filter is applied, so "allclassic" and
    # "allstages" are computed over the rows of every level - confirm this is intended.
    for group in ("CLASSIC", "STAGES"):
        for sleep_level, sleep_type in itertools.product(base_sleep_levels[group] + ["all"], sleep_types):
            episodes = _selectSleepType(sleep_data, sleep_type)
            if sleep_level != "all":
                episodes = episodes[episodes["level"] == sleep_level]
            sleep_intraday_features = pd.concat([sleep_intraday_features, statsFeatures(episodes, features, sleep_level + group.lower() + sleep_type)], axis=1)

    # UNIFIED: rows are first merged into episodes of consecutive equal unified_level.
    for sleep_level, sleep_type in itertools.product(base_sleep_levels["UNIFIED"] + ["all"], sleep_types):
        episodes = _selectSleepType(sleep_data, sleep_type)
        if sleep_level != "all":
            episodes = episodes[episodes["unified_level"] == (0 if sleep_level == "awake" else 1)]
        episodes = mergeSleepEpisodes(episodes, ["local_segment", "unified_level_episode_id"])
        sleep_intraday_features = pd.concat([sleep_intraday_features, statsFeatures(episodes, features, sleep_level + "unified" + sleep_type)], axis=1)

    # Ignore the levels (e.g. countepisode[all][main]): one episode per type_episode_id.
    for sleep_type in sleep_types:
        episodes = mergeSleepEpisodes(_selectSleepType(sleep_data, sleep_type), ["local_segment", "type_episode_id"])
        sleep_intraday_features = pd.concat([sleep_intraday_features, statsFeatures(episodes, features, "all" + sleep_type)], axis=1)

    return sleep_intraday_features


def ratiosFeatures(sleep_intraday_features, ratios_types, ratios_scopes, sleep_levels, sleep_types):
    """Append the requested RATIOS columns, derived from the stats columns already present.

    Since all the stats features have been computed whether or not they were requested,
    each ratio is a division of two existing columns. "count" ratios use the countepisode
    columns, any other ratios type uses the sumduration columns. For example (ACROSS_LEVELS):
        ratiocount[remstages] = countepisode[remstages][all] / countepisode[all][stages][all]

    Parameters:
        sleep_intraday_features: frame holding the stats columns.
        ratios_types: e.g. ["count", "duration"].
        ratios_scopes: any of "ACROSS_LEVELS", "ACROSS_TYPES", "WITHIN_LEVELS", "WITHIN_TYPES".
        sleep_levels: e.g. {"CLASSIC": ["awake", "restless", "asleep"], "UNIFIED": ["awake", "asleep"]}.
        sleep_types: e.g. ["main", "nap"].

    Returns:
        The input frame when no ratio applies, otherwise a new frame with the ratio columns
        appended in the order: across levels, across types, then within levels / within
        types for each (ratios type, level, sleep type) combination.
    """
    # {"CLASSIC": ["awake"], "UNIFIED": ["asleep"]} -> [("classic", "awake"), ("unified", "asleep")]
    levels_with_group = [(group.lower(), level) for group in sleep_levels for level in sleep_levels[group]]

    def agg_of(ratios_type):
        return "countepisode" if ratios_type == "count" else "sumduration"

    def ratio(numerator, denominator, name):
        # Name the result explicitly instead of relying on the anonymous column of to_frame().
        return (sleep_intraday_features[numerator] / sleep_intraday_features[denominator]).rename(name)

    ratios = []

    if "ACROSS_LEVELS" in ratios_scopes:
        for ratios_type, (group, level) in itertools.product(ratios_types, levels_with_group):
            agg = agg_of(ratios_type)
            ratios.append(ratio(agg + level + group + "all", agg + "all" + group + "all", "ratio" + ratios_type + level + group))

    if "ACROSS_TYPES" in ratios_scopes:
        for ratios_type in ratios_types:
            agg = agg_of(ratios_type)
            ratios.append(ratio(agg + "allmain", agg + "allall", "ratio" + ratios_type + "main"))

    within_levels = "WITHIN_LEVELS" in ratios_scopes
    within_types = "WITHIN_TYPES" in ratios_scopes
    if within_levels or within_types:
        for ratios_type, (group, level), sleep_type in itertools.product(ratios_types, levels_with_group, sleep_types):
            agg = agg_of(ratios_type)
            numerator = agg + level + group + sleep_type
            if within_levels:
                ratios.append(ratio(numerator, agg + level + group + "all", "ratio" + ratios_type + sleep_type + "within" + level + group))
            if within_types:
                ratios.append(ratio(numerator, agg + "all" + group + sleep_type, "ratio" + ratios_type + level + group + "within" + sleep_type))

    if not ratios:
        return sleep_intraday_features
    # Every ratio shares the index of the stats frame, so a single concat is enough.
    return pd.concat([sleep_intraday_features] + ratios, axis=1)


def singleSleepTypeRoutineFeatures(sleep_intraday_data, routine, reference_time, sleep_type, sleep_intraday_features):
    """Add starttimefirst<sleep_type> / endtimelast<sleep_type> (in minutes) when requested.

    Parameters:
        sleep_intraday_data: rows with at least local_segment and is_main_sleep, plus the
            columns read by the requested feature.
        routine: names of the requested routine features.
        reference_time: "MIDNIGHT" (minutes after midnight) or "START_OF_THE_SEGMENT"
            (minutes after segment_start_timestamp; timestamps are divided by 60 * 1000).
        sleep_type: "mainsleep" selects is_main_sleep == 1, anything else selects 0.
        sleep_intraday_features: frame indexed by local_segment; modified in place.

    Returns:
        `sleep_intraday_features` with the requested columns set.

    Raises:
        ValueError: if a feature of this sleep type is requested and `reference_time` is
            neither "MIDNIGHT" nor "START_OF_THE_SEGMENT".
    """
    start_feature, end_feature = "starttimefirst" + sleep_type, "endtimelast" + sleep_type
    if start_feature not in routine and end_feature not in routine:
        return sleep_intraday_features
    if reference_time not in _REFERENCE_TIMES:
        raise ValueError("Please check FITBIT_SLEEP_INTRADAY section of config.yaml: REFERENCE_TIME can only be MIDNIGHT or START_OF_THE_SEGMENT.")

    episodes = sleep_intraday_data[sleep_intraday_data["is_main_sleep"] == (1 if sleep_type == "mainsleep" else 0)]
    # first()/last() follow row order; assumes rows are chronological - TODO confirm with the caller.
    by_segment = episodes.groupby(["local_segment"])

    if start_feature in routine:
        first = by_segment.first()
        if reference_time == "MIDNIGHT":
            sleep_intraday_features[start_feature] = first["start_minutes"]
        else:
            sleep_intraday_features[start_feature] = (first["start_timestamp"] - first["segment_start_timestamp"]) / (60 * 1000)

    if end_feature in routine:
        last = by_segment.last()
        if reference_time == "MIDNIGHT":
            # Coerce first: the values are plain strings when they come straight from read_csv.
            end_times = pd.to_datetime(last["local_end_date_time"])
            sleep_intraday_features[end_feature] = end_times.dt.hour * 60 + end_times.dt.minute + end_times.dt.second / 60
        else:
            sleep_intraday_features[end_feature] = (last["end_timestamp"] - last["segment_start_timestamp"]) / (60 * 1000)

    return sleep_intraday_features


def routineFeatures(sleep_intraday_data, routine, reference_time, sleep_type, sleep_intraday_features):
    """Add the requested ROUTINE features for main sleep and for naps.

    `sleep_type` is accepted for interface compatibility but is not used: which sleep types
    are processed depends only on the feature names listed in `routine`.
    """
    if "starttimefirstmainsleep" in routine or "endtimelastmainsleep" in routine:
        sleep_intraday_features = singleSleepTypeRoutineFeatures(sleep_intraday_data, routine, reference_time, "mainsleep", sleep_intraday_features)

    if "starttimefirstnap" in routine or "endtimelastnap" in routine:
        sleep_intraday_features = singleSleepTypeRoutineFeatures(sleep_intraday_data, routine, reference_time, "nap", sleep_intraday_features)

    return sleep_intraday_features


def _minutesSinceMidnight(local_time):
    """Convert an "HH:MM:SS" string into minutes after midnight (seconds as a fraction)."""
    parts = local_time.split(":")
    return int(parts[0]) * 60 + int(parts[1]) + int(parts[2]) / 60


def rapids_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    """Compute the Fitbit sleep intraday features of one time segment.

    Parameters:
        sensor_data_files: dict whose "sensor_data" entry is passed to `pd.read_csv`.
        time_segment: forwarded to `filter_data_by_segment`.
        provider: provider configuration; reads "FEATURES", "SLEEP_LEVELS", "SLEEP_TYPES",
            "INCLUDE_SLEEP_LATER_THAN" and "REFERENCE_TIME".
        filter_data_by_segment: callable `(data, time_segment) -> data`.

    Returns:
        A frame whose columns are "local_segment" followed by the requested features, in a
        deterministic order. It has no rows when no sleep data is left for the segment.
    """
    sleep_intraday_data = pd.read_csv(sensor_data_files["sensor_data"])

    consider_all = provider["FEATURES"]["LEVELS_AND_TYPES_COMBINING_ALL"]
    include_sleep_later_than = provider["INCLUDE_SLEEP_LATER_THAN"]
    reference_time = provider["REFERENCE_TIME"]

    requested_intraday_features = provider["FEATURES"]
    requested_sleep_levels = provider["SLEEP_LEVELS"]
    requested_sleep_types = provider["SLEEP_TYPES"]

    # Name of the features this function can compute
    base_intraday_features = {"LEVELS_AND_TYPES": ["countepisode", "sumduration", "maxduration", "minduration", "avgduration", "medianduration", "stdduration"],
                                "RATIOS_TYPE": ["count", "duration"],
                                "RATIOS_SCOPE": ["ACROSS_LEVELS", "ACROSS_TYPES", "WITHIN_LEVELS", "WITHIN_TYPES"],
                                "ROUTINE": ["starttimefirstmainsleep", "endtimelastmainsleep", "starttimefirstnap", "endtimelastnap"]}
    base_sleep_levels = {"CLASSIC": ["awake", "restless", "asleep"],
                            "STAGES": ["wake", "deep", "light", "rem"],
                            "UNIFIED": ["awake", "asleep"]}
    base_sleep_types = ["main", "nap"]

    # The subset of requested features this function can compute. Filtering the base lists
    # (instead of intersecting sets) keeps the column order identical between runs, and a
    # key missing from the config simply yields an empty list.
    intraday_features_to_compute = {key: [feature for feature in base if feature in (requested_intraday_features.get(key) or [])]
                                    for key, base in base_intraday_features.items()}
    sleep_levels_to_compute = {group: [level for level in base_sleep_levels[group] if level in (requested_sleep_levels[group] or [])]
                               for group in requested_sleep_levels if group in base_sleep_levels}
    sleep_types_to_compute = [sleep_type for sleep_type in base_sleep_types if sleep_type in (requested_sleep_types or [])]

    # Full names
    features_fullnames = featuresFullNames(intraday_features_to_compute, sleep_levels_to_compute, sleep_types_to_compute, consider_all)

    # Include sleep later than: start_minutes comes from the local_time of the first row of
    # each start_timestamp; rows starting earlier in the day than the threshold are dropped.
    start_minutes = sleep_intraday_data.groupby("start_timestamp").first()["local_time"].apply(_minutesSinceMidnight).to_frame().rename(columns={"local_time": "start_minutes"}).reset_index()
    sleep_intraday_data = sleep_intraday_data.merge(start_minutes, on="start_timestamp", how="left")
    sleep_intraday_data = sleep_intraday_data[sleep_intraday_data["start_minutes"] >= include_sleep_later_than]

    sleep_intraday_data = filter_data_by_segment(sleep_intraday_data, time_segment)

    if sleep_intraday_data.empty:
        return pd.DataFrame(columns=features_fullnames)

    # While level_episode_id is based on levels provided by Fitbit (classic & stages),
    # unified_level_episode_id is based on unified_level: a new id starts whenever
    # type_episode_id or unified_level differs from the previous row.
    episode_keys = sleep_intraday_data[["type_episode_id", "unified_level"]]
    sleep_intraday_data.insert(3, "unified_level_episode_id", (episode_keys != episode_keys.shift()).any(axis=1).cumsum())

    # ALL LEVELS AND TYPES: compute all stats features whether or not they are requested
    sleep_intraday_features = allStatsFeatures(sleep_intraday_data, base_sleep_levels, base_sleep_types, base_intraday_features["LEVELS_AND_TYPES"], pd.DataFrame())

    # RATIOS: only compute requested features
    sleep_intraday_features = ratiosFeatures(sleep_intraday_features, intraday_features_to_compute["RATIOS_TYPE"], intraday_features_to_compute["RATIOS_SCOPE"], sleep_levels_to_compute, sleep_types_to_compute)

    # ROUTINE: only compute requested features
    sleep_intraday_features = routineFeatures(sleep_intraday_data, intraday_features_to_compute["ROUTINE"], reference_time, sleep_types_to_compute, sleep_intraday_features)

    # Reset index and discard features which are not requested by user
    sleep_intraday_features.index.name = "local_segment"
    sleep_intraday_features.reset_index(inplace=True)
    return sleep_intraday_features[features_fullnames]


# ---------------------------------------------------------------------------------------
# The same patch also changes src/features/utils/utils.py (index b0399de8..1ea86222,
# hunk @@ -55,7 +55,7 @@ def chunk_episodes(sensor_episodes)). That hunk is not part of this
# module and is only partially visible; it is kept verbatim below so that no patch content
# is lost. It stops excluding segment_start_timestamp / segment_end_timestamp from the
# groupby columns; segment_start_timestamp is read by singleSleepTypeRoutineFeatures above.
#
#       sensor_episodes["duration"] = (sensor_episodes["chunked_end_timestamp"] - sensor_episodes["chunked_start_timestamp"]) / (1000 * 60)
#
#       # Merge episodes
#  -    cols_for_groupby = [col for col in sensor_episodes.columns if col not in ["timestamps_segment", "timestamp", "assigned_segments", "start_datetime", "end_datetime", "start_timestamp", "end_timestamp", "duration", "segment_start_timestamp", "segment_end_timestamp", "chunked_start_timestamp", "chunked_end_timestamp"]]
#  +    cols_for_groupby = [col for col in sensor_episodes.columns if col not in ["timestamps_segment", "timestamp", "assigned_segments", "start_datetime", "end_datetime", "start_timestamp", "end_timestamp", "duration", "chunked_start_timestamp", "chunked_end_timestamp"]]
#       sensor_episodes_grouped = sensor_episodes.groupby(by=cols_for_groupby)
#       merged_sensor_episodes = sensor_episodes_grouped[["duration"]].sum()