From 614e759551d3c9c7604ca7737e79b6087da61402 Mon Sep 17 00:00:00 2001 From: JulioV Date: Wed, 2 Dec 2020 18:41:03 -0500 Subject: [PATCH] Refactor day segments to time segments --- .github/workflows/docs.yaml | 2 +- .gitignore | 2 +- .travis.yml | 2 +- config.yaml | 6 +- ...s_default.csv => timesegments_default.csv} | 0 ...ments_event.csv => timesegments_event.csv} | 0 ...equency.csv => timesegments_frequency.csv} | 0 ...periodic.csv => timesegments_periodic.csv} | 0 docs/features/add-new-features.md | 24 +- docs/features/fitbit-heartrate-intraday.md | 24 +- docs/features/fitbit-heartrate-summary.md | 34 +-- docs/features/fitbit-sleep-summary.md | 26 +-- docs/features/fitbit-steps-intraday.md | 40 ++-- docs/features/fitbit-steps-summary.md | 12 +- docs/features/phone-accelerometer.md | 8 +- docs/features/phone-activity-recognition.md | 4 +- .../features/phone-applications-foreground.md | 10 +- docs/features/phone-battery.md | 4 +- docs/features/phone-bluetooth.md | 10 +- docs/features/phone-calls.md | 30 +-- docs/features/phone-conversation.md | 8 +- docs/features/phone-data-yield.md | 16 +- docs/features/phone-light.md | 4 +- docs/features/phone-locations.md | 22 +- docs/features/phone-messages.md | 14 +- docs/features/phone-screen.md | 4 +- docs/features/phone-wifi-connected.md | 10 +- docs/features/phone-wifi-visible.md | 10 +- docs/file-structure.md | 4 +- docs/index.md | 6 +- docs/setup/configuration.md | 32 +-- docs/setup/execution.md | 4 +- docs/workflow-examples/minimal.md | 8 +- example_profile/example_config.yaml | 6 +- ...s.csv => exampleworkflow_timesegments.csv} | 0 rules/features.smk | 76 +++--- rules/models.smk | 8 +- rules/preprocessing.smk | 42 ++-- rules/reports.smk | 2 +- sn_profile_rapids/Snakefile | 66 +++--- sn_profile_rapids/config.yaml | 2 +- sn_profile_rapids/pipeline_config.yaml | 2 +- ...day_segment.R => assign_to_time_segment.R} | 52 ++--- src/data/compute_day_segments.py | 216 ------------------ src/data/compute_time_segments.py | 216 ++++++++++++++++++ src/data/phone_sensed_bins.R | 40 ---- src/data/phone_valid_sensed_days.R | 18 -- src/data/readable_datetime.R | 18 +- src/features/entry.R | 6 +- src/features/entry.py | 6 +- .../fitbit_heartrate_intraday/rapids/main.py | 8 +- .../fitbit_heartrate_summary/rapids/main.py | 6 +- .../fitbit_sleep_summary/rapids/main.py | 6 +- .../fitbit_steps_intraday/rapids/main.py | 4 +- .../fitbit_steps_summary/rapids/main.py | 6 +- .../phone_accelerometer/panda/main.py | 4 +- .../phone_accelerometer/rapids/main.py | 4 +- .../phone_activity_recognition/rapids/main.py | 4 +- .../rapids/main.py | 14 +- src/features/phone_battery/rapids/main.py | 4 +- src/features/phone_bluetooth/rapids/main.R | 8 +- src/features/phone_calls/rapids/main.R | 10 +- .../phone_conversation/rapids/main.py | 4 +- src/features/phone_data_yield/rapids/main.R | 8 +- src/features/phone_light/rapids/main.py | 4 +- src/features/phone_locations/barnett/main.R | 12 +- src/features/phone_locations/doryab/main.py | 6 +- src/features/phone_messages/rapids/main.R | 10 +- src/features/phone_screen/rapids/main.py | 8 +- .../phone_wifi_connected/rapids/main.R | 8 +- src/features/phone_wifi_visible/rapids/main.R | 8 +- src/features/utils/utils.R | 24 +- src/features/utils/utils.py | 18 +- src/models/workflow_example/parse_targets.py | 6 +- tests/scripts/utils.py | 14 +- tests/settings/frequency/config.yaml | 2 +- tests/settings/frequency/testing_config.yaml | 8 +- tests/settings/periodic/config.yaml | 2 +- tests/settings/periodic/testing_config.yaml | 8 +- 79 files 
changed, 663 insertions(+), 721 deletions(-) rename data/external/{daysegments_default.csv => timesegments_default.csv} (100%) rename data/external/{daysegments_event.csv => timesegments_event.csv} (100%) rename data/external/{daysegments_frequency.csv => timesegments_frequency.csv} (100%) rename data/external/{daysegments_periodic.csv => timesegments_periodic.csv} (100%) rename example_profile/{exampleworkflow_daysegments.csv => exampleworkflow_timesegments.csv} (100%) rename src/data/{assign_to_day_segment.R => assign_to_time_segment.R} (84%) delete mode 100644 src/data/compute_day_segments.py create mode 100644 src/data/compute_time_segments.py delete mode 100644 src/data/phone_sensed_bins.R delete mode 100644 src/data/phone_valid_sensed_days.R diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 80d7fb43..a50cf292 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -2,7 +2,7 @@ name: docs on: push: branches: - - day_segments + - time_segments jobs: deploy: runs-on: ubuntu-latest diff --git a/.gitignore b/.gitignore index d0920998..60899167 100644 --- a/.gitignore +++ b/.gitignore @@ -95,7 +95,7 @@ packrat/* data/external/* !/data/external/.gitkeep !/data/external/stachl_application_genre_catalogue.csv -!/data/external/daysegments*.csv +!/data/external/timesegments*.csv data/raw/* !/data/raw/.gitkeep data/interim/* diff --git a/.travis.yml b/.travis.yml index 98421603..98993d3e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -52,7 +52,7 @@ jobs: branches: only: - master - - day_segment + - time_segment stages: - name: deploy diff --git a/config.yaml b/config.yaml index 8af1d996..1aef391f 100644 --- a/config.yaml +++ b/config.yaml @@ -25,10 +25,10 @@ CREATE_PARTICIPANT_FILES: DEVICE_ID_COLUMN: device_id # column name IGNORED_DEVICE_IDS: [] -# See https://www.rapids.science/latest/setup/configuration/#day-segments -DAY_SEGMENTS: &day_segments +# See https://www.rapids.science/latest/setup/configuration/#time-segments +TIME_SEGMENTS: &time_segments TYPE: PERIODIC # FREQUENCY, PERIODIC, EVENT - FILE: "data/external/daysegments_periodic.csv" + FILE: "data/external/timesegments_periodic.csv" INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC, see docs diff --git a/data/external/daysegments_default.csv b/data/external/timesegments_default.csv similarity index 100% rename from data/external/daysegments_default.csv rename to data/external/timesegments_default.csv diff --git a/data/external/daysegments_event.csv b/data/external/timesegments_event.csv similarity index 100% rename from data/external/daysegments_event.csv rename to data/external/timesegments_event.csv diff --git a/data/external/daysegments_frequency.csv b/data/external/timesegments_frequency.csv similarity index 100% rename from data/external/daysegments_frequency.csv rename to data/external/timesegments_frequency.csv diff --git a/data/external/daysegments_periodic.csv b/data/external/timesegments_periodic.csv similarity index 100% rename from data/external/daysegments_periodic.csv rename to data/external/timesegments_periodic.csv diff --git a/docs/features/add-new-features.md b/docs/features/add-new-features.md index 1b8a6071..3f1dba8d 100644 --- a/docs/features/add-new-features.md +++ b/docs/features/add-new-features.md @@ -83,12 +83,12 @@ In this step you need to add a folder, script and function for your provider. !!! 
info "Python function" ```python - def [providername]_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs): + def [providername]_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs): ``` !!! info "R function" ```r - [providername]_features <- function(sensor_data, day_segment, provider) + [providername]_features <- function(sensor_data, time_segment, provider) ``` ### Implement your feature extraction code @@ -98,7 +98,7 @@ The provider function that you created in the step above will receive the follow | Parameter                                       | Description |---|---| |`sensor_data_files`| Path to the CSV file containing the data of a single participant. This data has been cleaned and preprocessed. Your function will be automatically called for each participant in your study (in the `[PIDS]` array in `config.yaml`) -|`day_segment`| The label of the day segment that should be processed. +|`time_segment`| The label of the time segment that should be processed. |`provider`| The parameters you configured for your provider in `config.yaml` will be available in this variable as a dictionary in Python or a list in R. In our example this dictionary contains `{MY_PARAMETER:"a_string"}` |`filter_data_by_segment`| Python only. A function that you will use to filter your data. In R this function is already available in the environment. |`*args`| Python only. Not used for now @@ -115,24 +115,24 @@ The code to extract your behavioral features should be implemented in your provi Note that phone's battery, screen, and activity recognition data is given as episodes instead of event rows (for example, start and end timestamps of the periods the phone screen was on) -??? info "2. Filter your data to process only those rows that belong to `day_segment`" +??? info "2. Filter your data to process only those rows that belong to `time_segment`" This step is only one line of code, but to undersand why we need it, keep reading. ```python - acc_data = filter_data_by_segment(acc_data, day_segment) + acc_data = filter_data_by_segment(acc_data, time_segment) ``` - You should use the `filter_data_by_segment()` function to process and group those rows that belong to each of the [day segments RAPIDS could be configured with](../../setup/configuration/#day-segments). + You should use the `filter_data_by_segment()` function to process and group those rows that belong to each of the [time segments RAPIDS could be configured with](../../setup/configuration/#time-segments). - Let's understand the `filter_data_by_segment()` function with an example. A RAPIDS user can extract features on any arbitrary [day segment](../../setup/configuration/#day-segments). A day segment is a period of time that has a label and one or more instances. For example, the user (or you) could have requested features on a daily, weekly, and week-end basis for `p01`. The labels are arbritrary and the instances depend on the days a participant was monitored for: + Let's understand the `filter_data_by_segment()` function with an example. A RAPIDS user can extract features on any arbitrary [time segment](../../setup/configuration/#time-segments). A time segment is a period of time that has a label and one or more instances. For example, the user (or you) could have requested features on a daily, weekly, and week-end basis for `p01`. 
The labels are arbitrary and the instances depend on the days a participant was monitored for: - the daily segment could be named `my_days` and if `p01` was monitored for 14 days, it would have 14 instances - the weekly segment could be named `my_weeks` and if `p01` was monitored for 14 days, it would have 2 instances. - the weekend segment could be named `my_weekends` and if `p01` was monitored for 14 days, it would have 2 instances. - For this example, RAPIDS will call your provider function three times for `p01`, once where `day_segment` is `my_days`, once where `day_segment` is `my_weeks` and once where `day_segment` is `my_weekends`. In this example not every row in `p01`'s data needs to take part in the feature computation for either segment **and** the rows need to be grouped differently. + For this example, RAPIDS will call your provider function three times for `p01`, once where `time_segment` is `my_days`, once where `time_segment` is `my_weeks`, and once where `time_segment` is `my_weekends`. In this example not every row in `p01`'s data needs to take part in the feature computation for either segment **and** the rows need to be grouped differently. - Thus `filter_data_by_segment()` comes in handy, it will return a data frame that contains the rows that were logged during a day segment plus an extra column called `local_segment`. This new column will have as many unique values as day segment instances exist (14, 2, and 2 for our `p01`'s `my_days`, `my_weeks`, and `my_weekends` examples). After filtering, **you should group the data frame by this column and compute any desired features**, for example: + Thus `filter_data_by_segment()` comes in handy: it will return a data frame that contains the rows that were logged during a time segment plus an extra column called `local_segment`. This new column will have as many unique values as time segment instances exist (14, 2, and 2 for our `p01`'s `my_days`, `my_weeks`, and `my_weekends` examples). After filtering, **you should group the data frame by this column and compute any desired features**, for example: ```python acc_features["maxmagnitude"] = acc_data.groupby(["local_segment"])["magnitude"].max() ``` @@ -143,7 +143,7 @@ The code to extract your behavioral features should be implemented in your provi ??? info "3. Return a data frame with your features" After filtering, grouping your data, and computing your features, your provider function should return a data frame that has: - - One row per day segment instance (e.g. 14 our `p01`'s `my_days` example) + - One row per time segment instance (e.g. 14 for our `p01`'s `my_days` example) - The `local_segment` column added by `filter_data_by_segment()` - One column per feature. By convention the name of your features should only contain letters or numbers (`feature1`). RAPIDS will automatically add the right sensor and provider prefix (`phone_accelerometer_vega_`)
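As a minimal sketch of this contract (the `vega` provider name and the `rowcount` feature are hypothetical, used only for illustration; this is not RAPIDS code), a provider that satisfies all three requirements could look like:

```python
import pandas as pd

def vega_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    # Load the cleaned, preprocessed data for one participant
    data = pd.read_csv(sensor_data_files["sensor_data"])
    features = pd.DataFrame(columns=["local_segment", "rowcount"])
    if not data.empty:
        # filter_data_by_segment() keeps only the rows logged during this time segment
        # and adds the local_segment column (one unique value per segment instance)
        data = filter_data_by_segment(data, time_segment)
        if not data.empty:
            # one row per segment instance, one column per feature
            features = data.groupby("local_segment").size().reset_index(name="rowcount")
    return features
```

RAPIDS would then prefix the feature column with the sensor and provider name when assembling the final feature matrix.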
@@ -151,7 +151,7 @@ The code to extract your behavioral features should be implemented in your provi For your reference, this is a short example of our own provider (`RAPIDS`) for `PHONE_ACCELEROMETER` that computes five acceleration features ```python - def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs): + def rapids_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs): acc_data = pd.read_csv(sensor_data_files["sensor_data"]) requested_features = provider["FEATURES"] @@ -162,7 +162,7 @@ The code to extract your behavioral features should be implemented in your provi acc_features = pd.DataFrame(columns=["local_segment"] + features_to_compute) if not acc_data.empty: - acc_data = filter_data_by_segment(acc_data, day_segment) + acc_data = filter_data_by_segment(acc_data, time_segment) if not acc_data.empty: acc_features = pd.DataFrame() diff --git a/docs/features/fitbit-heartrate-intraday.md b/docs/features/fitbit-heartrate-intraday.md index bafb8944..3dcedc14 100644 --- a/docs/features/fitbit-heartrate-intraday.md +++ b/docs/features/fitbit-heartrate-intraday.md @@ -31,8 +31,8 @@ We provide examples of the input format that RAPIDS expects, note that both exam ## RAPIDS provider -!!! info "Available day segments" - - Available for all day segments +!!! info "Available time segments" + - Available for all time segments !!! info "File Sequence" ```bash @@ -56,16 +56,16 @@ Features description for `[FITBIT_HEARTRATE_INTRADAY][PROVIDERS][RAPIDS]`: |Feature |Units |Description| |-------------------------- |-------------- |---------------------------| -|maxhr |beats/mins |The maximum heart rate during a day segment. -|minhr |beats/mins |The minimum heart rate during a day segment. -|avghr |beats/mins |The average heart rate during a day segment. -|medianhr |beats/mins |The median of heart rate during a day segment. -|modehr |beats/mins |The mode of heart rate during a day segment. -|stdhr |beats/mins |The standard deviation of heart rate during a day segment. -|diffmaxmodehr |beats/mins |The difference between the maximum and mode heart rate during a day segment. -|diffminmodehr |beats/mins |The difference between the mode and minimum heart rate during a day segment. -|entropyhr |nats |Shannon’s entropy measurement based on heart rate during a day segment. -|minutesonZONE |minutes |Number of minutes the user’s heart rate fell within each `heartrate_zone` during a day segment. +|maxhr |beats/mins |The maximum heart rate during a time segment. +|minhr |beats/mins |The minimum heart rate during a time segment. +|avghr |beats/mins |The average heart rate during a time segment. +|medianhr |beats/mins |The median of heart rate during a time segment. +|modehr |beats/mins |The mode of heart rate during a time segment. +|stdhr |beats/mins |The standard deviation of heart rate during a time segment. +|diffmaxmodehr |beats/mins |The difference between the maximum and mode heart rate during a time segment. +|diffminmodehr |beats/mins |The difference between the mode and minimum heart rate during a time segment. +|entropyhr |nats |Shannon’s entropy measurement based on heart rate during a time segment. +|minutesonZONE |minutes |Number of minutes the user’s heart rate fell within each `heartrate_zone` during a time segment. !!!
note "Assumptions/Observations" diff --git a/docs/features/fitbit-heartrate-summary.md b/docs/features/fitbit-heartrate-summary.md index 553421c1..9392dea6 100644 --- a/docs/features/fitbit-heartrate-summary.md +++ b/docs/features/fitbit-heartrate-summary.md @@ -31,7 +31,7 @@ We provide examples of the input format that RAPIDS expects, note that both exam ## RAPIDS provider -!!! info "Available day segments" +!!! info "Available time segments" - Only available for segments that span 1 or more complete days (e.g. Jan 1st 00:00 to Jan 3rd 23:59) !!! info "File Sequence" @@ -56,22 +56,22 @@ Features description for `[FITBIT_HEARTRATE_SUMMARY][PROVIDERS][RAPIDS]`: |Feature |Units |Description| |-------------------------- |---------- |---------------------------| -|maxrestinghr |beats/mins |The maximum daily resting heart rate during a day segment. -|minrestinghr |beats/mins |The minimum daily resting heart rate during a day segment. -|avgrestinghr |beats/mins |The average daily resting heart rate during a day segment. -|medianrestinghr |beats/mins |The median of daily resting heart rate during a day segment. -|moderestinghr |beats/mins |The mode of daily resting heart rate during a day segment. -|stdrestinghr |beats/mins |The standard deviation of daily resting heart rate during a day segment. -|diffmaxmoderestinghr |beats/mins |The difference between the maximum and mode daily resting heart rate during a day segment. -|diffminmoderestinghr |beats/mins |The difference between the mode and minimum daily resting heart rate during a day segment. -|entropyrestinghr |nats |Shannon’s entropy measurement based on daily resting heart rate during a day segment. -|sumcaloriesZONE |cals |The total daily calories burned within `heartrate_zone` during a day segment. -|maxcaloriesZONE |cals |The maximum daily calories burned within `heartrate_zone` during a day segment. -|mincaloriesZONE |cals |The minimum daily calories burned within `heartrate_zone` during a day segment. -|avgcaloriesZONE |cals |The average daily calories burned within `heartrate_zone` during a day segment. -|mediancaloriesZONE |cals |The median of daily calories burned within `heartrate_zone` during a day segment. -|stdcaloriesZONE |cals |The standard deviation of daily calories burned within `heartrate_zone` during a day segment. -|entropycaloriesZONE |nats |Shannon’s entropy measurement based on daily calories burned within `heartrate_zone` during a day segment. +|maxrestinghr |beats/mins |The maximum daily resting heart rate during a time segment. +|minrestinghr |beats/mins |The minimum daily resting heart rate during a time segment. +|avgrestinghr |beats/mins |The average daily resting heart rate during a time segment. +|medianrestinghr |beats/mins |The median of daily resting heart rate during a time segment. +|moderestinghr |beats/mins |The mode of daily resting heart rate during a time segment. +|stdrestinghr |beats/mins |The standard deviation of daily resting heart rate during a time segment. +|diffmaxmoderestinghr |beats/mins |The difference between the maximum and mode daily resting heart rate during a time segment. +|diffminmoderestinghr |beats/mins |The difference between the mode and minimum daily resting heart rate during a time segment. +|entropyrestinghr |nats |Shannon’s entropy measurement based on daily resting heart rate during a time segment. +|sumcaloriesZONE |cals |The total daily calories burned within `heartrate_zone` during a time segment. 
+|maxcaloriesZONE |cals |The maximum daily calories burned within `heartrate_zone` during a time segment. +|mincaloriesZONE |cals |The minimum daily calories burned within `heartrate_zone` during a time segment. +|avgcaloriesZONE |cals |The average daily calories burned within `heartrate_zone` during a time segment. +|mediancaloriesZONE |cals |The median of daily calories burned within `heartrate_zone` during a time segment. +|stdcaloriesZONE |cals |The standard deviation of daily calories burned within `heartrate_zone` during a time segment. +|entropycaloriesZONE |nats |Shannon’s entropy measurement based on daily calories burned within `heartrate_zone` during a time segment. !!! note "Assumptions/Observations" diff --git a/docs/features/fitbit-sleep-summary.md b/docs/features/fitbit-sleep-summary.md index c9bc8064..9ac79357 100644 --- a/docs/features/fitbit-sleep-summary.md +++ b/docs/features/fitbit-sleep-summary.md @@ -51,7 +51,7 @@ We provide examples of the input format that RAPIDS expects, note that both exam ## RAPIDS provider -!!! info "Available day segments" +!!! info "Available time segments" - Only available for segments that span 1 or more complete days (e.g. Jan 1st 00:00 to Jan 3rd 23:59) !!! info "File Sequence" @@ -77,18 +77,18 @@ Features description for `[FITBIT_SLEEP_SUMMARY][PROVIDERS][RAPIDS]`: |Feature |Units |Description | |------------------------------ |---------- |-------------------------------------------- | -|countepisodeTYPE |episodes |Number of sleep episodes for a certain sleep type during a day segment. -|avgefficiencyTYPE |scores |Average sleep efficiency for a certain sleep type during a day segment. -|sumdurationafterwakeupTYPE |minutes |Total duration the user stayed in bed after waking up for a certain sleep type during a day segment. -|sumdurationasleepTYPE |minutes |Total sleep duration for a certain sleep type during a day segment. -|sumdurationawakeTYPE |minutes |Total duration the user stayed awake but still in bed for a certain sleep type during a day segment. -|sumdurationtofallasleepTYPE |minutes |Total duration the user spent to fall asleep for a certain sleep type during a day segment. -|sumdurationinbedTYPE |minutes |Total duration the user stayed in bed (sumdurationtofallasleep + sumdurationawake + sumdurationasleep + sumdurationafterwakeup) for a certain sleep type during a day segment. -|avgdurationafterwakeupTYPE |minutes |Average duration the user stayed in bed after waking up for a certain sleep type during a day segment. -|avgdurationasleepTYPE |minutes |Average sleep duration for a certain sleep type during a day segment. -|avgdurationawakeTYPE |minutes |Average duration the user stayed awake but still in bed for a certain sleep type during a day segment. -|avgdurationtofallasleepTYPE |minutes |Average duration the user spent to fall asleep for a certain sleep type during a day segment. -|avgdurationinbedTYPE |minutes |Average duration the user stayed in bed (sumdurationtofallasleep + sumdurationawake + sumdurationasleep + sumdurationafterwakeup) for a certain sleep type during a day segment. +|countepisodeTYPE |episodes |Number of sleep episodes for a certain sleep type during a time segment. +|avgefficiencyTYPE |scores |Average sleep efficiency for a certain sleep type during a time segment. +|sumdurationafterwakeupTYPE |minutes |Total duration the user stayed in bed after waking up for a certain sleep type during a time segment. 
+|sumdurationasleepTYPE |minutes |Total sleep duration for a certain sleep type during a time segment. +|sumdurationawakeTYPE |minutes |Total duration the user stayed awake but still in bed for a certain sleep type during a time segment. +|sumdurationtofallasleepTYPE |minutes |Total duration the user spent to fall asleep for a certain sleep type during a time segment. +|sumdurationinbedTYPE |minutes |Total duration the user stayed in bed (sumdurationtofallasleep + sumdurationawake + sumdurationasleep + sumdurationafterwakeup) for a certain sleep type during a time segment. +|avgdurationafterwakeupTYPE |minutes |Average duration the user stayed in bed after waking up for a certain sleep type during a time segment. +|avgdurationasleepTYPE |minutes |Average sleep duration for a certain sleep type during a time segment. +|avgdurationawakeTYPE |minutes |Average duration the user stayed awake but still in bed for a certain sleep type during a time segment. +|avgdurationtofallasleepTYPE |minutes |Average duration the user spent to fall asleep for a certain sleep type during a time segment. +|avgdurationinbedTYPE |minutes |Average duration the user stayed in bed (sumdurationtofallasleep + sumdurationawake + sumdurationasleep + sumdurationafterwakeup) for a certain sleep type during a time segment. diff --git a/docs/features/fitbit-steps-intraday.md b/docs/features/fitbit-steps-intraday.md index aba4da84..0dbffe53 100644 --- a/docs/features/fitbit-steps-intraday.md +++ b/docs/features/fitbit-steps-intraday.md @@ -31,8 +31,8 @@ We provide examples of the input format that RAPIDS expects, note that both exam ## RAPIDS provider -!!! info "Available day segments" - - Available for all day segments +!!! info "Available time segments" + - Available for all time segments !!! info "File Sequence" ```bash @@ -51,30 +51,30 @@ Parameters description for `[FITBIT_STEPS_INTRADAY][PROVIDERS][RAPIDS]`: |`[COMPUTE]` | Set to `True` to extract `FITBIT_STEPS_INTRADAY` features from the `RAPIDS` provider| |`[FEATURES]` | Features to be computed from steps intraday data, see table below | |`[THRESHOLD_ACTIVE_BOUT]` | Every minute with Fitbit steps data will be labelled as `sedentary` if its step count is below this threshold, otherwise, `active`. | -|`[INCLUDE_ZERO_STEP_ROWS]` | Whether or not to include day segments with a 0 step count during the whole day. | +|`[INCLUDE_ZERO_STEP_ROWS]` | Whether or not to include time segments with a 0 step count during the whole day. | Features description for `[FITBIT_STEPS_INTRADAY][PROVIDERS][RAPIDS]`: |Feature |Units |Description | |-------------------------- |-------------- |-------------------------------------------------------------| -|sumsteps |steps |The total step count during a day segment. -|maxsteps |steps |The maximum step count during a day segment. -|minsteps |steps |The minimum step count during a day segment. -|avgsteps |steps |The average step count during a day segment. -|stdsteps |steps |The standard deviation of step count during a day segment. -|countepisodesedentarybout |bouts |Number of sedentary bouts during a day segment. -|sumdurationsedentarybout |minutes |Total duration of all sedentary bouts during a day segment. -|maxdurationsedentarybout |minutes |The maximum duration of any sedentary bout during a day segment. -|mindurationsedentarybout |minutes |The minimum duration of any sedentary bout during a day segment. -|avgdurationsedentarybout |minutes |The average duration of sedentary bouts during a day segment.
-|stddurationsedentarybout |minutes |The standard deviation of the duration of sedentary bouts during a day segment. -|countepisodeactivebout |bouts |Number of active bouts during a day segment. -|sumdurationactivebout |minutes |Total duration of all active bouts during a day segment. -|maxdurationactivebout |minutes |The maximum duration of any active bout during a day segment. -|mindurationactivebout |minutes |The minimum duration of any active bout during a day segment. -|avgdurationactivebout |minutes |The average duration of active bouts during a day segment. -|stddurationactivebout |minutes |The standard deviation of the duration of active bouts during a day segment. +|sumsteps |steps |The total step count during a time segment. +|maxsteps |steps |The maximum step count during a time segment. +|minsteps |steps |The minimum step count during a time segment. +|avgsteps |steps |The average step count during a time segment. +|stdsteps |steps |The standard deviation of step count during a time segment. +|countepisodesedentarybout |bouts |Number of sedentary bouts during a time segment. +|sumdurationsedentarybout |minutes |Total duration of all sedentary bouts during a time segment. +|maxdurationsedentarybout |minutes |The maximum duration of any sedentary bout during a time segment. +|mindurationsedentarybout |minutes |The minimum duration of any sedentary bout during a time segment. +|avgdurationsedentarybout |minutes |The average duration of sedentary bouts during a time segment. +|stddurationsedentarybout |minutes |The standard deviation of the duration of sedentary bouts during a time segment. +|countepisodeactivebout |bouts |Number of active bouts during a time segment. +|sumdurationactivebout |minutes |Total duration of all active bouts during a time segment. +|maxdurationactivebout |minutes |The maximum duration of any active bout during a time segment. +|mindurationactivebout |minutes |The minimum duration of any active bout during a time segment. +|avgdurationactivebout |minutes |The average duration of active bouts during a time segment. +|stddurationactivebout |minutes |The standard deviation of the duration of active bouts during a time segment. !!! note "Assumptions/Observations" diff --git a/docs/features/fitbit-steps-summary.md b/docs/features/fitbit-steps-summary.md index a9b5298a..af8bb134 100644 --- a/docs/features/fitbit-steps-summary.md +++ b/docs/features/fitbit-steps-summary.md @@ -31,7 +31,7 @@ We provide examples of the input format that RAPIDS expects, note that both exam ## RAPIDS provider -!!! info "Available day segments" +!!! info "Available time segments" - Only available for segments that span 1 or more complete days (e.g. Jan 1st 00:00 to Jan 3rd 23:59) !!! info "File Sequence" @@ -56,11 +56,11 @@ Features description for `[FITBIT_STEPS_SUMMARY][PROVIDERS][RAPIDS]`: |Feature |Units |Description | |-------------------------- |---------- |-------------------------------------------- | -|maxsumsteps |steps |The maximum daily step count during a day segment. -|minsumsteps |steps |The minimum daily step count during a day segment. -|avgsumsteps |steps |The average daily step count during a day segment. -|mediansumsteps |steps |The median of daily step count during a day segment. -|stdsumsteps |steps |The standard deviation of daily step count during a day segment. +|maxsumsteps |steps |The maximum daily step count during a time segment. +|minsumsteps |steps |The minimum daily step count during a time segment. 
+|avgsumsteps |steps |The average daily step count during a time segment. +|mediansumsteps |steps |The median of daily step count during a time segment. +|stdsumsteps |steps |The standard deviation of daily step count during a time segment. !!! note "Assumptions/Observations" diff --git a/docs/features/phone-accelerometer.md b/docs/features/phone-accelerometer.md index 3009a8b5..64a43380 100644 --- a/docs/features/phone-accelerometer.md +++ b/docs/features/phone-accelerometer.md @@ -8,8 +8,8 @@ Sensor parameters description for `[PHONE_ACCELEROMETER]`: ## RAPIDS provider -!!! info "Available day segments and platforms" - - Available for all day segments +!!! info "Available time segments and platforms" + - Available for all time segments - Available for Android and iOS !!! info "File Sequence" @@ -46,8 +46,8 @@ Features description for `[PHONE_ACCELEROMETER][PROVIDERS][RAPIDS]`: These features are based on the work by [Panda et al](../../citation#panda-accelerometer). -!!! info "Available day segments and platforms" - - Available for all day segments +!!! info "Available time segments and platforms" + - Available for all time segments - Available for Android and iOS !!! info "File Sequence" diff --git a/docs/features/phone-activity-recognition.md b/docs/features/phone-activity-recognition.md index 9c8a78e8..e2b791b7 100644 --- a/docs/features/phone-activity-recognition.md +++ b/docs/features/phone-activity-recognition.md @@ -10,8 +10,8 @@ Sensor parameters description for `[PHONE_ACTIVITY_RECOGNITION]`: ## RAPIDS provider -!!! info "Available day segments and platforms" - - Available for all day segments +!!! info "Available time segments and platforms" + - Available for all time segments - Available for Android and iOS !!! info "File Sequence" diff --git a/docs/features/phone-applications-foreground.md b/docs/features/phone-applications-foreground.md index dec35e92..53f1aed0 100644 --- a/docs/features/phone-applications-foreground.md +++ b/docs/features/phone-applications-foreground.md @@ -14,8 +14,8 @@ Sensor parameters description for `[PHONE_APPLICATIONS_FOREGROUND]` (these param The app category (genre) catalogue used in these features was originally created by [Stachl et al](../../citation#stachl-applications-foreground). -!!! info "Available day segments and platforms" - - Available for all day segments +!!! info "Available time segments and platforms" + - Available for all time segments - Available for Android only !!! info "File Sequence" @@ -45,9 +45,9 @@ Features description for `[PHONE_APPLICATIONS_FOREGROUND][PROVIDERS][RAPIDS]`: |Feature |Units |Description| |-------------------------- |---------- |---------------------------| |count |apps | Number of times a single app or apps within a category were used (i.e. they were brought to the foreground either by tapping their icon or switching to it from another app) -|timeoffirstuse |minutes | The time in minutes between 12:00am (midnight) and the first use of a single app or apps within a category during a `day_segment` -|timeoflastuse |minutes | The time in minutes between 12:00am (midnight) and the last use of a single app or apps within a category during a `day_segment` -|frequencyentropy |nats | The entropy of the used apps within a category during a `day_segment` (each app is seen as a unique event, the more apps were used, the higher the entropy). This is especially relevant when computed over all apps. 
Entropy cannot be obtained for a single app +|timeoffirstuse |minutes | The time in minutes between 12:00am (midnight) and the first use of a single app or apps within a category during a `time_segment` +|timeoflastuse |minutes | The time in minutes between 12:00am (midnight) and the last use of a single app or apps within a category during a `time_segment` +|frequencyentropy |nats | The entropy of the used apps within a category during a `time_segment` (each app is seen as a unique event, the more apps were used, the higher the entropy). This is especially relevant when computed over all apps. Entropy cannot be obtained for a single app !!! note "Assumptions/Observations" Features can be computed by app, by apps grouped under a single category (genre) and by multiple categories grouped together (meta-categories). For example, we can get features for `Facebook` (single app), for `Social Network` apps (a category including Facebook and other social media apps) or for `Social` (a meta-category formed by `Social Network` and `Social Media Tools` categories). diff --git a/docs/features/phone-battery.md b/docs/features/phone-battery.md index b19b3fbb..8b791537 100644 --- a/docs/features/phone-battery.md +++ b/docs/features/phone-battery.md @@ -9,8 +9,8 @@ Sensor parameters description for `[PHONE_BATTERY]`: ## RAPIDS provider -!!! info "Available day segments and platforms" - - Available for all day segments +!!! info "Available time segments and platforms" + - Available for all time segments - Available for Android and iOS !!! info "File Sequence" diff --git a/docs/features/phone-bluetooth.md b/docs/features/phone-bluetooth.md index c16958ed..e37b2628 100644 --- a/docs/features/phone-bluetooth.md +++ b/docs/features/phone-bluetooth.md @@ -8,8 +8,8 @@ Sensor parameters description for `[PHONE_BLUETOOTH]`: ## RAPIDS provider -!!! info "Available day segments and platforms" - - Available for all day segments +!!! info "Available time segments and platforms" + - Available for all time segments - Available for Android only !!! info "File Sequence" @@ -33,9 +33,9 @@ Features description for `[PHONE_BLUETOOTH][PROVIDERS][RAPIDS]`: |Feature |Units |Description| |-------------------------- |---------- |---------------------------| -| countscans | devices | Number of scanned devices during a `day_segment`, a device can be detected multiple times over time and these appearances are counted separately | -| uniquedevices | devices | Number of unique devices during a `day_segment` as identified by their hardware (`bt_address`) address | -| countscansmostuniquedevice | scans | Number of scans of the most scanned device during a `day_segment` across the whole monitoring period | +| countscans | devices | Number of scanned devices during a `time_segment`, a device can be detected multiple times over time and these appearances are counted separately | +| uniquedevices | devices | Number of unique devices during a `time_segment` as identified by their hardware (`bt_address`) address | +| countscansmostuniquedevice | scans | Number of scans of the most scanned device during a `time_segment` across the whole monitoring period | !!! note "Assumptions/Observations" NA diff --git a/docs/features/phone-calls.md b/docs/features/phone-calls.md index 51cd53ce..f96ef060 100644 --- a/docs/features/phone-calls.md +++ b/docs/features/phone-calls.md @@ -8,8 +8,8 @@ Sensor parameters description for `[PHONE_CALLS]`: ## RAPIDS Provider -!!! info "Available day segments and platforms" - - Available for all day segments +!!! 
info "Available time segments and platforms" + - Available for all time segments - Available for Android and iOS !!! info "File Sequence" @@ -35,28 +35,28 @@ Features description for `[PHONE_CALLS][PROVIDERS][RAPIDS]` incoming and outgoin |Feature |Units |Description| |-------------------------- |---------- |---------------------------| -|count |calls |Number of calls of a particular `call_type` occurred during a particular `day_segment`. -|distinctcontacts |contacts |Number of distinct contacts that are associated with a particular `call_type` for a particular `day_segment` -|meanduration |seconds |The mean duration of all calls of a particular `call_type` during a particular `day_segment`. -|sumduration |seconds |The sum of the duration of all calls of a particular `call_type` during a particular `day_segment`. -|minduration |seconds |The duration of the shortest call of a particular `call_type` during a particular `day_segment`. -|maxduration |seconds |The duration of the longest call of a particular `call_type` during a particular `day_segment`. -|stdduration |seconds |The standard deviation of the duration of all the calls of a particular `call_type` during a particular `day_segment`. -|modeduration |seconds |The mode of the duration of all the calls of a particular `call_type` during a particular `day_segment`. -|entropyduration |nats |The estimate of the Shannon entropy for the the duration of all the calls of a particular `call_type` during a particular `day_segment`. +|count |calls |Number of calls of a particular `call_type` occurred during a particular `time_segment`. +|distinctcontacts |contacts |Number of distinct contacts that are associated with a particular `call_type` for a particular `time_segment` +|meanduration |seconds |The mean duration of all calls of a particular `call_type` during a particular `time_segment`. +|sumduration |seconds |The sum of the duration of all calls of a particular `call_type` during a particular `time_segment`. +|minduration |seconds |The duration of the shortest call of a particular `call_type` during a particular `time_segment`. +|maxduration |seconds |The duration of the longest call of a particular `call_type` during a particular `time_segment`. +|stdduration |seconds |The standard deviation of the duration of all the calls of a particular `call_type` during a particular `time_segment`. +|modeduration |seconds |The mode of the duration of all the calls of a particular `call_type` during a particular `time_segment`. +|entropyduration |nats |The estimate of the Shannon entropy for the the duration of all the calls of a particular `call_type` during a particular `time_segment`. |timefirstcall |minutes |The time in minutes between 12:00am (midnight) and the first call of `call_type`. |timelastcall |minutes |The time in minutes between 12:00am (midnight) and the last call of `call_type`. -|countmostfrequentcontact |calls |The number of calls of a particular `call_type` during a particular `day_segment` of the most frequent contact throughout the monitored period. +|countmostfrequentcontact |calls |The number of calls of a particular `call_type` during a particular `time_segment` of the most frequent contact throughout the monitored period. Features description for `[PHONE_CALLS][PROVIDERS][RAPIDS]` missed calls: |Feature |Units |Description| |-------------------------- |---------- |---------------------------| -|count |calls |Number of `missed` calls that occurred during a particular `day_segment`. 
-|distinctcontacts |contacts |Number of distinct contacts that are associated with `missed` calls for a particular `day_segment` +|count |calls |Number of `missed` calls that occurred during a particular `time_segment`. +|distinctcontacts |contacts |Number of distinct contacts that are associated with `missed` calls for a particular `time_segment` |timefirstcall |minutes |The time in hours from 12:00am (Midnight) that the first `missed` call occurred. |timelastcall |minutes |The time in hours from 12:00am (Midnight) that the last `missed` call occurred. -|countmostfrequentcontact |calls |The number of `missed` calls during a particular `day_segment` of the most frequent contact throughout the monitored period. +|countmostfrequentcontact |calls |The number of `missed` calls during a particular `time_segment` of the most frequent contact throughout the monitored period. !!! note "Assumptions/Observations" 1. Traces for iOS calls are unique even for the same contact calling a participant more than once which renders `countmostfrequentcontact` meaningless and `distinctcontacts` equal to the total number of traces. diff --git a/docs/features/phone-conversation.md b/docs/features/phone-conversation.md index 4e96e369..d67747aa 100644 --- a/docs/features/phone-conversation.md +++ b/docs/features/phone-conversation.md @@ -9,8 +9,8 @@ Sensor parameters description for `[PHONE_CONVERSATION]`: ## RAPIDS provider -!!! info "Available day segments and platforms" - - Available for all day segments +!!! info "Available time segments and platforms" + - Available for all time segments - Available for Android only !!! info "File Sequence" @@ -46,8 +46,8 @@ Features description for `[PHONE_CONVERSATION][PROVIDERS][RAPIDS]`: | minconversationduration | minutes | Shortest duration of all conversations | | avgconversationduration | minutes | Average duration of all conversations | | sdconversationduration | minutes | Standard Deviation of the duration of all conversations | -| timefirstconversation | minutes | Minutes since midnight when the first conversation for a day segment was detected | -| timelastconversation | minutes | Minutes since midnight when the last conversation for a day segment was detected | +| timefirstconversation | minutes | Minutes since midnight when the first conversation for a time segment was detected | +| timelastconversation | minutes | Minutes since midnight when the last conversation for a time segment was detected | | noisesumenergy | L2-norm | Sum of all energy values when inference is noise | | noiseavgenergy | L2-norm | Average of all energy values when inference is noise | | noisesdenergy | L2-norm | Standard Deviation of all energy values when inference is noise | diff --git a/docs/features/phone-data-yield.md b/docs/features/phone-data-yield.md index a51775da..5327a131 100644 --- a/docs/features/phone-data-yield.md +++ b/docs/features/phone-data-yield.md @@ -1,6 +1,6 @@ # Phone Data Yield -This is a combinatorial sensor which means that we use the data from multiple sensors to extract data yield features. Data yield features can be used to remove rows ([day segments](../../setup/configuration/#day-segments)) that do not contain enough data. You should decide what is your "enough" threshold depending on the type of sensors you collected (frequency vs event based, e.g. acceleroemter vs calls), the length of your study, and the rates of missing data that your analysis could handle. 
+This is a combinatorial sensor which means that we use the data from multiple sensors to extract data yield features. Data yield features can be used to remove rows ([time segments](../../setup/configuration/#time-segments)) that do not contain enough data. You should decide what your "enough" threshold is depending on the type of sensors you collected (frequency vs event based, e.g. accelerometer vs calls), the length of your study, and the rates of missing data that your analysis could handle. !!! hint "Why is data yield important?" Imagine that you want to extract `PHONE_CALL` features on daily segments (`00:00` to `23:59`). Let's say that on day 1 the phone logged 10 calls and 23 hours of data from other sensors and on day 2 the phone logged 10 calls and only 2 hours of data from other sensors. It's more likely that other calls were placed on the 22 hours of data that you didn't log on day 2 than on the 1 hour of data you didn't log on day 1, and so including day 2 in your analysis could bias your results. @@ -35,10 +35,10 @@ Before explaining the data yield features, let's define the following relevant c - A valid minute is any 60 second window when any phone sensor logged at least 1 row of data - A valid hour is any 60 minute window with at least X valid minutes. The X or threshold is given by `[MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS]` -The timestamps of all sensors are concatenated and then grouped per day segment. Minute and hour windows are created from the beginning of each day segment instance and these windows are marked as valid based on the definitions above. The duration of each day segment is taken into account to compute the features described below. +The timestamps of all sensors are concatenated and then grouped per time segment. Minute and hour windows are created from the beginning of each time segment instance and these windows are marked as valid based on the definitions above. The duration of each time segment is taken into account to compute the features described below. -!!! info "Available day segments and platforms" - - Available for all day segments +!!! info "Available time segments and platforms" + - Available for all time segments - Available for Android and iOS !!! info "File Sequence" ```bash @@ -64,14 +64,14 @@ Features description for `[PHONE_DATA_YIELD][PROVIDERS][RAPIDS]`: |Feature |Units |Description| |-------------------------- |---------- |---------------------------| -|ratiovalidyieldedminutes |rows | The ratio between the number of valid minutes and the duration in minutes of a day segment. -|ratiovalidyieldedhours |lux | The ratio between the number of valid hours and the duration in hours of a day segment. If the day segment is shorter than 1 hour this feature will always be 1. +|ratiovalidyieldedminutes |- | The ratio between the number of valid minutes and the duration in minutes of a time segment. +|ratiovalidyieldedhours |- | The ratio between the number of valid hours and the duration in hours of a time segment. If the time segment is shorter than 1 hour this feature will always be 1.
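As a rough illustration of the definitions above (an assumed standalone function over epoch timestamps, not RAPIDS' actual implementation), `ratiovalidyieldedminutes` boils down to:

```python
import math

def ratio_valid_yielded_minutes(timestamps_ms, segment_start_ms, segment_end_ms):
    """Fraction of 1-minute windows in a segment containing at least 1 sensor row."""
    duration_minutes = math.ceil((segment_end_ms - segment_start_ms) / 60000)
    # a minute window is valid if any sensor logged at least one row inside it
    valid_minutes = {(t - segment_start_ms) // 60000
                     for t in timestamps_ms
                     if segment_start_ms <= t < segment_end_ms}
    return len(valid_minutes) / duration_minutes if duration_minutes else 0.0

# e.g. a 1-hour segment with rows in only 2 of its 60 minutes yields ~0.033
print(ratio_valid_yielded_minutes([5_000, 65_000, 70_000], 0, 3_600_000))
```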
!!! note "Assumptions/Observations" 1. We recommend using `ratiovalidyieldedminutes` on time segments that are shorter than two or three hours and `ratiovalidyieldedhours` for longer segments. This is because relying on yielded minutes only can be misleading when a big chunk of those missing minutes is clustered together. For example, let's assume we are working with a 24-hour time segment that is missing 12 hours of data. Two extreme cases can occur:
  1. the 12 missing hours are from the beginning of the segment or
2. the 12 missing hours are spread in chunks throughout the segment. diff --git a/docs/features/phone-light.md b/docs/features/phone-light.md index 26b66ab0..e44b81e8 100644 --- a/docs/features/phone-light.md +++ b/docs/features/phone-light.md @@ -8,8 +8,8 @@ Sensor parameters description for `[PHONE_LIGHT]`: ## RAPIDS provider -!!! info "Available day segments and platforms" - - Available for all day segments +!!! info "Available time segments and platforms" + - Available for all time segments - Available for Android only !!! info "File Sequence" diff --git a/docs/features/phone-locations.md b/docs/features/phone-locations.md index 8c45ac66..646dc9b0 100644 --- a/docs/features/phone-locations.md +++ b/docs/features/phone-locations.md @@ -11,7 +11,7 @@ Sensor parameters description for `[PHONE_LOCATIONS]`: !!! note "Assumptions/Observations" **Types of location data to use** - AWARE Android and iOS clients can collect location coordinates through the phone\'s GPS, the network cellular towers around the phone or Google\'s fused location API. If you want to use only the GPS provider set `[LOCATIONS_TO_USE]` to `GPS`, if you want to use all providers (not recommended due to the difference in accuracy) set `[LOCATIONS_TO_USE]` to `ALL`, if your AWARE client was configured to use fused location only or want to focus only on this provider, set `[LOCATIONS_TO_USE]` to `RESAMPLE_FUSED`. `RESAMPLE_FUSED` takes the original fused location coordinates and replicates each pair forward in time as long as the phone was sensing data as indicated by [`PHONE_VALID_SENSED_BINS`](../phone-data-quality/#phone-valid-sensed-bins), this is done because Google\'s API only logs a new location coordinate pair when it is sufficiently different in time or space from the previous one. + AWARE Android and iOS clients can collect location coordinates through the phone\'s GPS, the network cellular towers around the phone or Google\'s fused location API. If you want to use only the GPS provider set `[LOCATIONS_TO_USE]` to `GPS`, if you want to use all providers (not recommended due to the difference in accuracy) set `[LOCATIONS_TO_USE]` to `ALL`, if your AWARE client was configured to use fused location only or want to focus only on this provider, set `[LOCATIONS_TO_USE]` to `RESAMPLE_FUSED`. `RESAMPLE_FUSED` takes the original fused location coordinates and replicates each pair forward in time as long as the phone was sensing data as indicated by the joined timestamps of [`[PHONE_DATA_YIELD][SENSORS]`](../phone-data-yield/), this is done because Google\'s API only logs a new location coordinate pair when it is sufficiently different in time or space from the previous one. There are two parameters associated with resampling fused location. `FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD` (in minutes, default 30) controls the maximum gap between any two coordinate pairs to replicate the last known pair (for example, participant A\'s phone did not collect data between 10:30am and 10:50am and between 11:05am and 11:40am, the last known coordinate pair will be replicated during the first period but not the second, in other words, we assume that we can no longer guarantee the participant stayed at the last known location if the phone did not sense data for more than 30 minutes). `FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION` (in minutes, default 720 or 12 hours) stops the last known fused location from being replicated longer than this threshold even if the phone was sensing data continuously (for example, participant A went home at 9pm and their phone was sensing data without gaps until 11am the next morning, the last known location will only be replicated until 9am). If you have suggestions to modify or improve this resampling, let us know.
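The gist of this resampling can be sketched as follows (a simplified illustration over assumed minute-level inputs; the pipeline's own preprocessing scripts remain the authoritative version):

```python
def resample_fused(fixes, sensed_minutes, consecutive_threshold=30, time_since_valid=720):
    """fixes: {minute: (lat, lon)} fused coordinate pairs;
    sensed_minutes: sorted minutes during which any sensor logged data.
    Yields (minute, lat, lon), carrying the last known pair forward in time."""
    last_fix = None   # (minute_of_fix, lat, lon)
    previous = None   # previous sensed minute
    for minute in sensed_minutes:
        if previous is not None and minute - previous > consecutive_threshold:
            last_fix = None  # sensing gap too long: stop replicating the old pair
        if minute in fixes:
            last_fix = (minute, *fixes[minute])
        # replicate only while the pair is younger than time_since_valid
        if last_fix is not None and minute - last_fix[0] <= time_since_valid:
            yield (minute, last_fix[1], last_fix[2])
        previous = minute
```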
@@ -20,7 +20,7 @@ Sensor parameters description for `[PHONE_LOCATIONS]`: These features are based on the original open-source implementation by [Barnett et al](../../citation#barnett-locations) and some features created by [Canzian et al](../../citation#canzian-locations). -!!! info "Available day segments and platforms" +!!! info "Available time segments and platforms" - Available only for segments that start at 00:00:00 and end at 23:59:59 of the same day (daily segments) - Available for Android and iOS @@ -42,7 +42,7 @@ Parameters description for `[PHONE_LOCATIONS][PROVIDERS][BARNETT]`: |`[FEATURES]` | Features to be computed, see table below |`[ACCURACY_LIMIT]` | An integer in meters, any location rows with an accuracy higher than this will be dropped. This number means there's a 68% probability the true location is within this radius |`[TIMEZONE]` | Timezone where the location data was collected. By default points to the one defined in the [Configuration](../../setup/configuration#timezone-of-your-study) -|`[MINUTES_DATA_USED]` | Set to `True` to include an extra column in the final location feature file containing the number of minutes used to compute the features on each day segment. Use this for quality control purposes, the more data minutes exist for a period, the more reliable its features should be. For fused location, a single minute can contain more than one coordinate pair if the participant is moving fast enough. +|`[MINUTES_DATA_USED]` | Set to `True` to include an extra column in the final location feature file containing the number of minutes used to compute the features on each time segment. Use this for quality control purposes, the more data minutes exist for a period, the more reliable its features should be. For fused location, a single minute can contain more than one coordinate pair if the participant is moving fast enough. @@ -80,8 +80,8 @@ Features description for `[PHONE_LOCATIONS][PROVIDERS][BARNETT]` adapted from [B These features are based on the original implementation by [Doryab et al.](../../citation#doryab-locations). -!!! info "Available day segments and platforms" - - Available for all day segments +!!! info "Available time segments and platforms" + - Available for all time segments - Available for Android and iOS !!! info "File Sequence" ```bash @@ -104,7 +104,7 @@ Parameters description for `[PHONE_LOCATIONS][PROVIDERS][BARNETT]`: | `[DBSCAN_MINSAMPLES]` | The number of samples (or total weight) in a neighborhood for a point to be considered as a core point of a cluster. This includes the point itself. | `[THRESHOLD_STATIC]` | It is the threshold value in km/hr which labels a row as Static or Moving. | `[MAXIMUM_GAP_ALLOWED]` | The maximum gap (in seconds) allowed between any two consecutive rows for them to be considered part of the same displacement. If this threshold is too high, it can throw speed and distance calculations off for periods when the phone was not sensing.
-| `[MINUTES_DATA_USED]` | Set to `True` to include an extra column in the final location feature file containing the number of minutes used to compute the features on each day segment. Use this for quality control purposes, the more data minutes exist for a period, the more reliable its features should be. For fused location, a single minute can contain more than one coordinate pair if the participant is moving fast enough. +| `[MINUTES_DATA_USED]` | Set to `True` to include an extra column in the final location feature file containing the number of minutes used to compute the features on each time segment. Use this for quality control purposes, the more data minutes exist for a period, the more reliable its features should be. For fused location, a single minute can contain more than one coordinate pair if the participant is moving fast enough. | `[SAMPLING_FREQUENCY]` | Expected time difference between any two location rows in minutes. If set to `0`, the sampling frequency will be inferred automatically as the median of all the differences between any two consecutive row timestamps (recommended if you are using `FUSED_RESAMPLED` data). This parameter impacts all the time calculations. @@ -114,18 +114,18 @@ Features description for `[PHONE_LOCATIONS][PROVIDERS][BARNETT]`: |-------------------------- |---------- |---------------------------| |locationvariance |$meters^2$ |The sum of the variances of the latitude and longitude columns. |loglocationvariance | - | Log of the sum of the variances of the latitude and longitude columns. -|totaldistance |meters |Total distance travelled in a day segment using the haversine formula. -|averagespeed |km/hr |Average speed in a day segment considering only the instances labeled as Moving. -|varspeed |km/hr |Speed variance in a day segment considering only the instances labeled as Moving. +|totaldistance |meters |Total distance travelled in a time segment using the haversine formula. +|averagespeed |km/hr |Average speed in a time segment considering only the instances labeled as Moving. +|varspeed |km/hr |Speed variance in a time segment considering only the instances labeled as Moving. |circadianmovement |- | \"It encodes the extent to which a person's location patterns follow a 24-hour circadian cycle.\" [Doryab et al.](../../citation#doryab-locations). |numberofsignificantplaces |places |Number of significant locations visited. It is calculated using the DBSCAN clustering algorithm which takes in EPS and MIN_SAMPLES as parameters to identify clusters. Each cluster is a significant place. -|numberlocationtransitions |transitions |Number of movements between any two clusters in a day segment. +|numberlocationtransitions |transitions |Number of movements between any two clusters in a time segment. |radiusgyration |meters |Quantifies the area covered by a participant |timeattop1location |minutes |Time spent at the most significant location. |timeattop2location |minutes |Time spent at the 2nd most significant location. |timeattop3location |minutes |Time spent at the 3rd most significant location. |movingtostaticratio | - | Ratio between the number of rows labeled Moving versus Static -|outlierstimepercent | - | Ratio between the number of rows that belong to non-significant clusters divided by the total number of rows in a day segment. +|outlierstimepercent | - | Ratio between the number of rows that belong to non-significant clusters divided by the total number of rows in a time segment. 
|maxlengthstayatclusters |minutes |Maximum time spent in a cluster (significant location). |minlengthstayatclusters |minutes |Minimum time spent in a cluster (significant location). |meanlengthstayatclusters |minutes |Average time spent in a cluster (significant location). diff --git a/docs/features/phone-messages.md b/docs/features/phone-messages.md index fec2957a..2c3d3108 100644 --- a/docs/features/phone-messages.md +++ b/docs/features/phone-messages.md @@ -8,8 +8,8 @@ Sensor parameters description for `[PHONE_MESSAGES]`: ## RAPIDS provider -!!! info "Available day segments and platforms" - - Available for all day segments +!!! info "Available time segments and platforms" + - Available for all time segments - Available for Android only !!! info "File Sequence" @@ -34,11 +34,11 @@ Features description for `[PHONE_MESSAGES][PROVIDERS][RAPIDS]`: |Feature |Units |Description| |-------------------------- |---------- |---------------------------| -|count |messages |Number of messages of type `messages_type` that occurred during a particular `day_segment`. -|distinctcontacts |contacts |Number of distinct contacts that are associated with a particular `messages_type` during a particular `day_segment`. -|timefirstmessages |minutes |Number of minutes between 12:00am (midnight) and the first `message` of a particular `messages_type` during a particular `day_segment`. -|timelastmessages |minutes |Number of minutes between 12:00am (midnight) and the last `message` of a particular `messages_type` during a particular `day_segment`. -|countmostfrequentcontact |messages |Number of messages from the contact with the most messages of `messages_type` during a `day_segment` throughout the whole dataset of each participant. +|count |messages |Number of messages of type `messages_type` that occurred during a particular `time_segment`. +|distinctcontacts |contacts |Number of distinct contacts that are associated with a particular `messages_type` during a particular `time_segment`. +|timefirstmessages |minutes |Number of minutes between 12:00am (midnight) and the first `message` of a particular `messages_type` during a particular `time_segment`. +|timelastmessages |minutes |Number of minutes between 12:00am (midnight) and the last `message` of a particular `messages_type` during a particular `time_segment`. +|countmostfrequentcontact |messages |Number of messages from the contact with the most messages of `messages_type` during a `time_segment` throughout the whole dataset of each participant. !!! note "Assumptions/Observations" 1. `[MESSAGES_TYPES]` and `[FEATURES]` keys in `config.yaml` need to match. For example, `[MESSAGES_TYPES]` `sent` matches the `[FEATURES]` key `sent` diff --git a/docs/features/phone-screen.md b/docs/features/phone-screen.md index c1203f7b..e438e328 100644 --- a/docs/features/phone-screen.md +++ b/docs/features/phone-screen.md @@ -8,8 +8,8 @@ Sensor parameters description for `[PHONE_SCREEN]`: ## RAPIDS provider -!!! info "Available day segments and platforms" - - Available for all day segments +!!! info "Available time segments and platforms" + - Available for all time segments - Available for Android and iOS !!! info "File Sequence" diff --git a/docs/features/phone-wifi-connected.md b/docs/features/phone-wifi-connected.md index 216a22ee..537bfd44 100644 --- a/docs/features/phone-wifi-connected.md +++ b/docs/features/phone-wifi-connected.md @@ -8,8 +8,8 @@ Sensor parameters description for `[PHONE_WIFI_CONNECTED]`: ## RAPIDS provider -!!! 
info "Available day segments and platforms" - - Available for all day segments +!!! info "Available time segments and platforms" + - Available for all time segments - Available for Android and iOS !!! info "File Sequence" @@ -33,9 +33,9 @@ Features description for `[PHONE_WIFI_CONNECTED][PROVIDERS][RAPIDS]`: |Feature |Units |Description| |-------------------------- |---------- |---------------------------| -| countscans | devices | Number of scanned WiFi access points connected during a day_segment, an access point can be detected multiple times over time and these appearances are counted separately | -| uniquedevices | devices | Number of unique access point during a day_segment as identified by their hardware address | -| countscansmostuniquedevice | scans | Number of scans of the most scanned access point during a day_segment across the whole monitoring period | +| countscans | devices | Number of scanned WiFi access points connected during a time_segment, an access point can be detected multiple times over time and these appearances are counted separately | +| uniquedevices | devices | Number of unique access point during a time_segment as identified by their hardware address | +| countscansmostuniquedevice | scans | Number of scans of the most scanned access point during a time_segment across the whole monitoring period | !!! note "Assumptions/Observations" 1. A connected WiFI access point is one that a phone was connected to. diff --git a/docs/features/phone-wifi-visible.md b/docs/features/phone-wifi-visible.md index 2a098e81..60786e07 100644 --- a/docs/features/phone-wifi-visible.md +++ b/docs/features/phone-wifi-visible.md @@ -8,8 +8,8 @@ Sensor parameters description for `[PHONE_WIFI_VISIBLE]`: ## RAPIDS provider -!!! info "Available day segments and platforms" - - Available for all day segments +!!! info "Available time segments and platforms" + - Available for all time segments - Available for Android only !!! info "File Sequence" @@ -33,9 +33,9 @@ Features description for `[PHONE_WIFI_VISIBLE][PROVIDERS][RAPIDS]`: |Feature |Units |Description| |-------------------------- |---------- |---------------------------| -| countscans | devices | Number of scanned WiFi access points visible during a day_segment, an access point can be detected multiple times over time and these appearances are counted separately | -| uniquedevices | devices | Number of unique access point during a day_segment as identified by their hardware address | -| countscansmostuniquedevice | scans | Number of scans of the most scanned access point during a day_segment across the whole monitoring period | +| countscans | devices | Number of scanned WiFi access points visible during a time_segment, an access point can be detected multiple times over time and these appearances are counted separately | +| uniquedevices | devices | Number of unique access point during a time_segment as identified by their hardware address | +| countscansmostuniquedevice | scans | Number of scans of the most scanned access point during a time_segment across the whole monitoring period | !!! note "Assumptions/Observations" 1. A visible WiFI access point is one that a phone sensed around itself but that it was not connected to. Due to API restrictions, this sensor is not available on iOS. diff --git a/docs/file-structure.md b/docs/file-structure.md index b3744db5..d4c3f6ad 100644 --- a/docs/file-structure.md +++ b/docs/file-structure.md @@ -5,14 +5,14 @@ All paths mentioned in this page are relative to RAPIDS' root folder. 
-If you want to extract the behavioral features that RAPIDS offers, you will only have to create or modify the [`.env` file](../setup/configuration/#database-credentials), [participants files](../setup/configuration/#participant-files), [day segment files](../setup/configuration/#day-segments), and the `config.yaml` file. The `config.yaml` file is the heart of RAPIDS and includes parameters to manage participants, data sources, sensor data, visualizations and more. +If you want to extract the behavioral features that RAPIDS offers, you will only have to create or modify the [`.env` file](../setup/configuration/#database-credentials), [participants files](../setup/configuration/#participant-files), [time segment files](../setup/configuration/#time-segments), and the `config.yaml` file. The `config.yaml` file is the heart of RAPIDS and includes parameters to manage participants, data sources, sensor data, visualizations and more. All data is saved in `data/`. The `data/external/` folder stores any data imported or created by the user, `data/raw/` stores sensor data as imported from your database, `data/interim/` has intermediate files necessary to compute behavioral features from raw data, and `data/processed/` has all the final files with the behavioral features in folders per participant and sensor. All the source code is saved in `src/`. The `src/data/` folder stores scripts to download, clean and pre-process sensor data, `src/features` has scripts to extract behavioral features organized in their respective subfolders, `src/models/` can host any script to create models or statistical analyses with the behavioral features you extract, and `src/visualization/` has scripts to create plots of the raw and processed data. -There are other important files and folders but only relevant if you are interested in extending RAPIDS (e.g. virtual env files, docs, tests, Dockerfile, the Snakefile, etc.). In the figure below, we represent the interactions between users and files. After a user modifies `config.yaml` and `.env` the `Snakefile` file will decide what Snakemake rules have to be executed to produce the required output files (behavioral features) and what scripts are in charge of producing such files. In addition, users can add or modifiy files in the `data` folder (for example to configure the [participants files](../setup/configuration/#participant-files) or the [day segment files](../setup/configuration/#day-segments)). +There are other important files and folders, but they are only relevant if you are interested in extending RAPIDS (e.g. virtual env files, docs, tests, Dockerfile, the Snakefile, etc.). In the figure below, we represent the interactions between users and files. After a user modifies `config.yaml` and `.env`, the `Snakefile` decides what Snakemake rules have to be executed to produce the required output files (behavioral features) and what scripts are in charge of producing such files. In addition, users can add or modify files in the `data` folder (for example, to configure the [participants files](../setup/configuration/#participant-files) or the [time segment files](../setup/configuration/#time-segments)).
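As a rough illustration of the three WiFi features described above for `phone-wifi-connected.md` and `phone-wifi-visible.md` (`countscans`, `uniquedevices`, `countscansmostuniquedevice`), here is a minimal pandas sketch. It is only a hedged example, not the pipeline's implementation (the real features are produced by the `phone_wifi_connected_r_features` and `phone_wifi_visible_r_features` rules shown in `rules/features.smk` below), and the `local_segment` and `bssid` column names are assumptions made for this sketch.

```python
# Hedged sketch of the three WiFi features, assuming one scanned access point
# per row, a `bssid` (hardware address) column, and a `local_segment` column
# holding each row's time segment label. These names are assumptions, not the
# pipeline's actual schema.
import pandas as pd

def wifi_features(scans: pd.DataFrame) -> pd.DataFrame:
    grouped = scans.groupby("local_segment")["bssid"]
    features = pd.DataFrame({
        "countscans": grouped.size(),        # every appearance counts separately
        "uniquedevices": grouped.nunique(),  # unique hardware addresses
    })
    # The most scanned access point is picked across the whole monitoring
    # period, then its scans are counted within each time segment.
    top_device = scans["bssid"].value_counts().idxmax()
    features["countscansmostuniquedevice"] = (
        scans[scans["bssid"] == top_device].groupby("local_segment").size()
    )
    return features.fillna(0).reset_index()
```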
diff --git a/docs/index.md index 9d47a173..7b43adda 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,7 +1,7 @@ # Welcome to RAPIDS documentation !!! warning - The functionality outlined in these docs is implemented in the branch `day_segments` which we will merge to `master` soon as release `0.1`. The previous (first) release of RAPIDS along with the old [docs](https://rapidspitt.readthedocs.io/en/latest/) will be labeled `beta`. If you landed on this page feel free to look around, just have in mind that we are polishing the last rough patches before we advertise `0.1` (Nov 16, 2020) + The functionality outlined in these docs is implemented in the branch `time_segments`, which we will soon merge to `master` as release `0.1`. The previous (first) release of RAPIDS along with the old [docs](https://rapidspitt.readthedocs.io/en/latest/) will be labeled `beta`. If you landed on this page feel free to look around, just keep in mind that we are polishing the last rough patches before we advertise `0.1` (Nov 16, 2020) Reproducible Analysis Pipeline for Data Streams (RAPIDS) allows you to process smartphone and wearable data to extract **behavioral features** (a.k.a. digital biomarkers/phenotypes). @@ -26,11 +26,11 @@ RAPIDS is formed by R and Python scripts orchestrated by [Snakemake](https://sna 5. **Parallel execution**. Thanks to Snakemake, your analysis can be executed over multiple cores without changing your code. 6. **Extensible code**. You can easily add your own behavioral features in R or Python and keep authorship and citations. 3. **Timezone aware**. Your data is adjusted to the specified timezone (multiple timezones support *coming soon*). -4. **Flexible day segments**. You can extract behavioral features on time windows of any length (e.g. 5 minutes, 3 hours, 2 days), on every day or particular days (e.g. weekends, Mondays, the 1st of each month, etc.) or around events of interest (e.g. surveys or clinical relapses). +4. **Flexible time segments**. You can extract behavioral features on time windows of any length (e.g. 5 minutes, 3 hours, 2 days), on every day or particular days (e.g. weekends, Mondays, the 1st of each month, etc.) or around events of interest (e.g. surveys or clinical relapses). 7. **Tested code**. We are constantly adding tests to make sure our behavioral features are correct. 8. **Reproducible code**. You can be sure your code will run on other computers as intended thanks to R and Python virtual environments. You can share your analysis code along with your publications without any overhead. 9. **Private**. All your data is processed locally. ## How is it organized? -In broad terms the `config.yaml`, [`.env` file](../setup/configuration/#database-credentials), [participants files](../setup/configuration/#participant-files), [day segment files](../setup/configuration/#day-segments) are the only ones that you will have to modify. All data is stored in `data/` and all scripts are stored in `src/`. For more information see RAPIDS' [File Structure](file-structure.md). +In broad terms, the `config.yaml`, [`.env` file](../setup/configuration/#database-credentials), [participants files](../setup/configuration/#participant-files), and [time segment files](../setup/configuration/#time-segments) are the only files that you will have to modify. All data is stored in `data/` and all scripts are stored in `src/`. For more information see RAPIDS' [File Structure](file-structure.md).
\ No newline at end of file diff --git a/docs/setup/configuration.md index 706617fd..a77aaeda 100644 --- a/docs/setup/configuration.md +++ b/docs/setup/configuration.md @@ -6,7 +6,7 @@ You need to follow these steps to configure your RAPIDS deployment before you ca 1. Add your [database credentials](#database-credentials) 2. Choose the [timezone of your study](#timezone-of-your-study) 3. Create your [participants files](#participant-files) -4. Select what [day segments](#day-segments) you want to extract features on +4. Select what [time segments](#time-segments) you want to extract features on 5. Modify your [device data source configuration](#device-data-source-configuration) 6. Select what [sensors and features](#sensor-and-features-to-process) you want to process @@ -197,27 +197,27 @@ You have two options a) use the `aware_device` table in your database or b) use --- -## Day Segments +## Time Segments -Day segments (or epochs) are the time windows on which you want to extract behavioral features. For example, you might want to process data on every day, every morning, or only during weekends. RAPIDS offers three categories of day segments that are flexible enough to cover most use cases: **frequency** (short time windows every day), **periodic** (arbitrary time windows on any day), and **event** (arbitrary time windows around events of interest). See also our [examples](#segment-examples). +Time segments (or epochs) are the time windows on which you want to extract behavioral features. For example, you might want to process data on every day, every morning, or only during weekends. RAPIDS offers three categories of time segments that are flexible enough to cover most use cases: **frequency** (short time windows every day), **periodic** (arbitrary time windows on any day), and **event** (arbitrary time windows around events of interest). See also our [examples](#segment-examples). === "Frequency Segments" These segments are computed on every day and all have the same duration (for example 30 minutes). Set the following keys in your `config.yaml` ```yaml - DAY_SEGMENTS: &day_segments + TIME_SEGMENTS: &time_segments TYPE: FREQUENCY FILE: "data/external/your_frequency_segments.csv" INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE ``` - The file pointed by `[DAY_SEGMENTS][FILE]` should have the following format and can only have 1 row. + The file pointed to by `[TIME_SEGMENTS][FILE]` should have the following format and can only have 1 row. | Column | Description | |--------|----------------------------------------------------------------------| - | label | A string that is used as a prefix in the name of your day segments | - | length | An integer representing the duration of your day segments in minutes | + | label | A string that is used as a prefix in the name of your time segments | + | length | An integer representing the duration of your time segments in minutes | !!! example ```csv label,length thirtyminutes,30 ``` - This configuration will compute 48 day segments for every day when any data from any participant was sensed. For example: + This configuration will compute 48 time segments for every day when any data from any participant was sensed. For example: ```csv start_time,length,label @@ -242,7 +242,7 @@ Day segments (or epochs) are the time windows on which you want to extract behav These segments can be computed every day, or on specific days of the week, month, quarter, and year.
Their minimum duration is 1 minute but they can be as long as you want. Set the following keys in your `config.yaml`. ```yaml - DAY_SEGMENTS: &day_segments + TIME_SEGMENTS: &time_segments TYPE: PERIODIC FILE: "data/external/your_periodic_segments.csv" INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # or TRUE @@ -250,11 +250,11 @@ Day segments (or epochs) are the time windows on which you want to extract behav If `[INCLUDE_PAST_PERIODIC_SEGMENTS]` is set to `TRUE`, RAPIDS will consider instances of your segments far enough back in the past to include the first row of data of each participant. For example, if the first row of data from a participant happened on Saturday March 7th 2020 and the requested segment duration is 7 days starting on every Sunday, the first segment to be considered would start on Sunday March 1st if `[INCLUDE_PAST_PERIODIC_SEGMENTS]` is `TRUE` or on Sunday March 8th if `FALSE`. - The file pointed by `[DAY_SEGMENTS][FILE]` should have the following format and can have multiple rows. + The file pointed to by `[TIME_SEGMENTS][FILE]` should have the following format and can have multiple rows. | Column | Description | |---------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| - | label | A string that is used as a prefix in the name of your day segments. It has to be **unique** between rows | + | label | A string that is used as a prefix in the name of your time segments. It has to be **unique** between rows | | start_time | A string with format `HH:MM:SS` representing the starting time of this segment on any day | | length | A string representing the length of this segment. It can have one or more of the following strings **`XXD XXH XXM XXS`** to represent days, hours, minutes and seconds. For example `7D 23H 59M 59S` | | repeats_on | One of the following options `every_day`, `wday`, `qday`, `mday`, and `yday`. The last four represent a week, quarter, month and year day | @@ -278,18 +278,18 @@ Day segments (or epochs) are the time windows on which you want to extract behav These segments can be computed before or after an event of interest (defined as any UNIX timestamp). Their minimum duration is 1 minute but they can be as long as you want. The start of each segment can be shifted backwards or forwards from the specified timestamp. Set the following keys in your `config.yaml`. ```yaml - DAY_SEGMENTS: &day_segments + TIME_SEGMENTS: &time_segments TYPE: EVENT FILE: "data/external/your_event_segments.csv" INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # or TRUE ``` - The file pointed by `[DAY_SEGMENTS][FILE]` should have the following format and can have multiple rows. + The file pointed to by `[TIME_SEGMENTS][FILE]` should have the following format and can have multiple rows. | Column | Description | |---------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| - | label | A string that is used as a prefix in the name of your day segments. If labels are unique, every segment is independent; if two or more segments have the same label, their data will be grouped when computing auxiliary data for features like the `most frequent contact` for calls (the most frequent contact will be computed across all these segments).
There cannot be two *overlaping* event segments with the same label (RAPIDS will throw an error) | - | event_timestamp | A UNIX timestamp that represents the moment an event of interest happened (clinical relapse, survey, readmission, etc.). The corresponding day segment will be computed around this moment using `length`, `shift`, and `shift_direction` | + | label | A string that is used as a prefix in the name of your time segments. If labels are unique, every segment is independent; if two or more segments have the same label, their data will be grouped when computing auxiliary data for features like the `most frequent contact` for calls (the most frequent contact will be computed across all these segments). There cannot be two *overlapping* event segments with the same label (RAPIDS will throw an error) | + | event_timestamp | A UNIX timestamp that represents the moment an event of interest happened (clinical relapse, survey, readmission, etc.). The corresponding time segment will be computed around this moment using `length`, `shift`, and `shift_direction` | | length | A string representing the length of this segment. It can have one or more of the following keys `XXD XXH XXM XXS` to represent a number of days, hours, minutes, and seconds. For example `7D 23H 59M 59S` | | shift | A string representing the time shift from `event_timestamp`. It can have one or more of the following keys `XXD XXH XXM XXS` to represent a number of days, hours, minutes and seconds. For example `7D 23H 59M 59S`. Use this value to change the start of a segment with respect to its `event_timestamp`. For instance, set this variable to `1H` to create a segment that starts 1 hour from an event of interest (`shift_direction` determines if it's before or after). | | shift_direction | An integer representing whether the `shift` is before (`-1`) or after (`1`) an `event_timestamp` | @@ -308,7 +308,7 @@ Day segments (or epochs) are the time windows on which you want to extract behav mood,1587906020000,7D,0,0,a748ee1a-1d0b-4ae9-9074-279a2b6ba524 ``` - This example will create eight segments for a single participant (`a748ee1a...`), five independent `stressX` segments with various lengths (1,4,3,7, and 9 hours). Segments `stress1`, `stress3`, and `stress5` are shifted forwards by 5 minutes and `stress2` and `stress4` are shifted backwards by 4 hours (that is, if the `stress4` event happened on March 15th at 1pm EST (`1584291600000`), the day segment will start on that day at 9am and end at 4pm). + This example will create eight segments for a single participant (`a748ee1a...`), five independent `stressX` segments with various lengths (1, 4, 3, 7, and 9 hours). Segments `stress1`, `stress3`, and `stress5` are shifted forwards by 5 minutes and `stress2` and `stress4` are shifted backwards by 4 hours (that is, if the `stress4` event happened on March 15th at 1pm EST (`1584291600000`), the time segment will start on that day at 9am and end at 4pm). The three `mood` segments are 1 hour, 1 day and 7 days long and have no shift. In addition, these `mood` segments are grouped together, meaning that although RAPIDS will compute features on each one of them, some necessary information to compute a few of those features will be extracted from all three segments, for example the phone contact that called a participant the most or the location clusters visited by a participant.
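To make the `length`, `shift`, and `shift_direction` mechanics above concrete, here is a minimal Python sketch of how an event segment's start and end could be derived from one row of the events file. The helper names are hypothetical, and this is not RAPIDS' actual code; the pipeline's own logic lives in `src/data/compute_time_segments.py` (wired up by the `compute_time_segments` rule in `rules/preprocessing.smk` below).

```python
# Hedged sketch: derive an event segment's [start, end] window (UNIX ms) from
# the columns described above. Helper names are hypothetical, not RAPIDS code.
import re

def parse_duration_ms(duration: str) -> int:
    """Convert a string like '7D 23H 59M 59S' to milliseconds."""
    seconds_per_unit = {"D": 86400, "H": 3600, "M": 60, "S": 1}
    seconds = sum(int(n) * seconds_per_unit[u]
                  for n, u in re.findall(r"(\d+)\s*([DHMS])", duration))
    return seconds * 1000

def event_segment_window(event_timestamp: int, length: str,
                         shift: str, shift_direction: int):
    """shift_direction is -1 (segment shifted before the event) or 1 (after)."""
    start = event_timestamp + shift_direction * parse_duration_ms(shift)
    return start, start + parse_duration_ms(length)

# stress4 from the example above: event at 1pm EST (1584291600000), 7 hours
# long, shifted backwards by 4 hours -> the window runs from 9am to 4pm.
start, end = event_segment_window(1584291600000, "7H", "4H", -1)
assert end - start == 7 * 3600 * 1000
```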
diff --git a/docs/setup/execution.md index 626fbf5a..8ccf2ef8 100644 --- a/docs/setup/execution.md +++ b/docs/setup/execution.md @@ -29,8 +29,8 @@ After you have [installed](../installation) and [configured](../configuration) R ``` !!! hint "Deleting RAPIDS output" - If you want to delete all the output files RAPIDS produces you can execute the following command (the content of these folders will be deleted: `data/raw`, `data/interim`, `data/processed`, `reports/figures`, and `reports/compliance`) + If you want to delete all the output files RAPIDS produces, you can execute the following command: ```bash - ./rapids -j1 -R clean + ./rapids -j1 --delete-all-output ``` diff --git a/docs/workflow-examples/minimal.md index 41743087..fc21a30a 100644 --- a/docs/workflow-examples/minimal.md +++ b/docs/workflow-examples/minimal.md @@ -21,7 +21,7 @@ This is a quick guide for creating and running a simple pipeline to extract miss END_DATE: 2021-01-01 # this can also be empty ``` - 4\. `[DAY_SEGMENTS][TYPE]` should be the default `PERIODIC`. Change `[DAY_SEGMENTS][FILE]` with the path of a file containing the following lines: + 4\. `[TIME_SEGMENTS][TYPE]` should be the default `PERIODIC`. Change `[TIME_SEGMENTS][FILE]` to the path of a file containing the following lines: ```csv label,start_time,length,repeats_on,repeats_value daily,00:00:00,23H 59M 59S,every_day,0 @@ -45,9 +45,9 @@ This is a quick guide for creating and running a simple pipeline to extract miss # ... other irrelevant sections - DAY_SEGMENTS: &day_segments + TIME_SEGMENTS: &time_segments TYPE: PERIODIC - FILE: "data/external/daysegments_periodic.csv" # make sure the three lines specified above are in the file + FILE: "data/external/timesegments_periodic.csv" # make sure the three lines specified above are in the file INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # No need to change this if you collected AWARE data on a database and your credentials are grouped under `MY_GROUP` in `.env` @@ -80,7 +80,7 @@ This is a quick guide for creating and running a simple pipeline to extract miss ```bash ./rapids -j1 ``` -4. The call features for daily and morning day segments will be in +4.
The call features for daily and morning time segments will be in ``` /data/processed/features/p01/phone_calls.csv ``` diff --git a/example_profile/example_config.yaml b/example_profile/example_config.yaml index d6e49640..8285c716 100644 --- a/example_profile/example_config.yaml +++ b/example_profile/example_config.yaml @@ -25,10 +25,10 @@ CREATE_PARTICIPANT_FILES: DEVICE_ID_COLUMN: device_id # column name IGNORED_DEVICE_IDS: [] -# See https://www.rapids.science/setup/configuration/#day-segments -DAY_SEGMENTS: &day_segments +# See https://www.rapids.science/setup/configuration/#time-segments +TIME_SEGMENTS: &time_segments TYPE: PERIODIC # FREQUENCY, PERIODIC, EVENT - FILE: "example_profile/exampleworkflow_daysegments.csv" + FILE: "example_profile/exampleworkflow_timesegments.csv" INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC, see docs diff --git a/example_profile/exampleworkflow_daysegments.csv b/example_profile/exampleworkflow_timesegments.csv similarity index 100% rename from example_profile/exampleworkflow_daysegments.csv rename to example_profile/exampleworkflow_timesegments.csv diff --git a/rules/features.smk b/rules/features.smk index 8942beb5..c106912f 100644 --- a/rules/features.smk +++ b/rules/features.smk @@ -11,7 +11,7 @@ rule join_features_from_providers: rule phone_data_yield_python_features: input: sensor_data = "data/interim/{pid}/phone_yielded_timestamps_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_DATA_YIELD"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -24,7 +24,7 @@ rule phone_data_yield_python_features: rule phone_data_yield_r_features: input: sensor_data = "data/interim/{pid}/phone_yielded_timestamps_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_DATA_YIELD"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -37,7 +37,7 @@ rule phone_data_yield_r_features: rule phone_accelerometer_python_features: input: sensor_data = "data/raw/{pid}/phone_accelerometer_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_ACCELEROMETER"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -50,7 +50,7 @@ rule phone_accelerometer_python_features: rule phone_accelerometer_r_features: input: sensor_data = "data/raw/{pid}/phone_accelerometer_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_ACCELEROMETER"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -73,7 +73,7 @@ rule activity_recognition_episodes: rule phone_activity_recognition_python_features: input: sensor_episodes = "data/interim/{pid}/phone_activity_recognition_episodes_resampled_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = 
"data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -86,7 +86,7 @@ rule phone_activity_recognition_python_features: rule phone_activity_recognition_r_features: input: sensor_episodes = "data/interim/{pid}/phone_activity_recognition_episodes_resampled_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -99,7 +99,7 @@ rule phone_activity_recognition_r_features: rule phone_applications_foreground_python_features: input: sensor_data = "data/raw/{pid}/phone_applications_foreground_with_datetime_with_categories.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_APPLICATIONS_FOREGROUND"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -112,7 +112,7 @@ rule phone_applications_foreground_python_features: rule phone_applications_foreground_r_features: input: sensor_data = "data/raw/{pid}/phone_applications_foreground_with_datetime_with_categories.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_APPLICATIONS_FOREGROUND"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -135,7 +135,7 @@ rule battery_episodes: rule phone_battery_python_features: input: sensor_episodes = "data/interim/{pid}/phone_battery_episodes_resampled_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_BATTERY"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -148,7 +148,7 @@ rule phone_battery_python_features: rule phone_battery_r_features: input: sensor_episodes = "data/interim/{pid}/phone_battery_episodes_resampled_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_BATTERY"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -161,7 +161,7 @@ rule phone_battery_r_features: rule phone_bluetooth_python_features: input: sensor_data = "data/raw/{pid}/phone_bluetooth_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_BLUETOOTH"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -174,7 +174,7 @@ rule phone_bluetooth_python_features: rule phone_bluetooth_r_features: input: sensor_data = "data/raw/{pid}/phone_bluetooth_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + 
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_BLUETOOTH"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -187,7 +187,7 @@ rule phone_bluetooth_r_features: rule calls_python_features: input: sensor_data = "data/raw/{pid}/phone_calls_with_datetime_unified.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_CALLS"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -200,7 +200,7 @@ rule calls_python_features: rule calls_r_features: input: sensor_data = "data/raw/{pid}/phone_calls_with_datetime_unified.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_CALLS"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -213,7 +213,7 @@ rule calls_r_features: rule conversation_python_features: input: sensor_data = "data/raw/{pid}/phone_conversation_with_datetime_unified.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_CONVERSATION"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -226,7 +226,7 @@ rule conversation_python_features: rule conversation_r_features: input: sensor_data = "data/raw/{pid}/phone_conversation_with_datetime_unified.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_CONVERSATION"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -239,7 +239,7 @@ rule conversation_r_features: rule phone_light_python_features: input: sensor_data = "data/raw/{pid}/phone_light_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_LIGHT"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -252,7 +252,7 @@ rule phone_light_python_features: rule phone_light_r_features: input: sensor_data = "data/raw/{pid}/phone_light_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_LIGHT"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -265,7 +265,7 @@ rule phone_light_r_features: rule phone_locations_python_features: input: sensor_data = "data/interim/{pid}/phone_locations_processed_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_LOCATIONS"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -278,7 +278,7 @@ rule phone_locations_python_features: 
rule phone_locations_r_features: input: sensor_data = "data/interim/{pid}/phone_locations_processed_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_LOCATIONS"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -291,7 +291,7 @@ rule phone_locations_r_features: rule phone_messages_python_features: input: sensor_data = "data/raw/{pid}/phone_messages_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_MESSAGES"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -304,7 +304,7 @@ rule phone_messages_python_features: rule phone_messages_r_features: input: sensor_data = "data/raw/{pid}/phone_messages_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_MESSAGES"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -325,7 +325,7 @@ rule screen_episodes: rule phone_screen_python_features: input: sensor_episodes = "data/interim/{pid}/phone_screen_episodes_resampled_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_SCREEN"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -338,7 +338,7 @@ rule phone_screen_python_features: rule phone_screen_r_features: input: sensor_episodes = "data/interim/{pid}/phone_screen_episodes_resampled_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_SCREEN"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -351,7 +351,7 @@ rule phone_screen_r_features: rule phone_wifi_connected_python_features: input: sensor_data = "data/raw/{pid}/phone_wifi_connected_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_WIFI_CONNECTED"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -364,7 +364,7 @@ rule phone_wifi_connected_python_features: rule phone_wifi_connected_r_features: input: sensor_data = "data/raw/{pid}/phone_wifi_connected_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_WIFI_CONNECTED"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -377,7 +377,7 @@ rule phone_wifi_connected_r_features: rule phone_wifi_visible_python_features: input: sensor_data = "data/raw/{pid}/phone_wifi_visible_with_datetime.csv", - day_segments_labels = 
"data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_WIFI_VISIBLE"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -390,7 +390,7 @@ rule phone_wifi_visible_python_features: rule phone_wifi_visible_r_features: input: sensor_data = "data/raw/{pid}/phone_wifi_visible_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_WIFI_VISIBLE"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -403,7 +403,7 @@ rule phone_wifi_visible_r_features: rule fitbit_heartrate_summary_python_features: input: sensor_data = "data/raw/{pid}/fitbit_heartrate_summary_parsed_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["FITBIT_HEARTRATE_SUMMARY"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -416,7 +416,7 @@ rule fitbit_heartrate_summary_python_features: rule fitbit_heartrate_summary_r_features: input: sensor_data = "data/raw/{pid}/fitbit_heartrate_summary_parsed_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["FITBIT_HEARTRATE_SUMMARY"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -429,7 +429,7 @@ rule fitbit_heartrate_summary_r_features: rule fitbit_heartrate_intraday_python_features: input: sensor_data = "data/raw/{pid}/fitbit_heartrate_intraday_parsed_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -442,7 +442,7 @@ rule fitbit_heartrate_intraday_python_features: rule fitbit_heartrate_intraday_r_features: input: sensor_data = "data/raw/{pid}/fitbit_heartrate_intraday_parsed_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -455,7 +455,7 @@ rule fitbit_heartrate_intraday_r_features: rule fitbit_steps_summary_python_features: input: sensor_data = "data/raw/{pid}/fitbit_steps_summary_parsed_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -468,7 +468,7 @@ rule fitbit_steps_summary_python_features: rule fitbit_steps_summary_r_features: input: sensor_data = "data/raw/{pid}/fitbit_steps_summary_parsed_with_datetime.csv", - day_segments_labels = 
"data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -481,7 +481,7 @@ rule fitbit_steps_summary_r_features: rule fitbit_steps_intraday_python_features: input: sensor_data = "data/raw/{pid}/fitbit_steps_intraday_parsed_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -494,7 +494,7 @@ rule fitbit_steps_intraday_python_features: rule fitbit_steps_intraday_r_features: input: sensor_data = "data/raw/{pid}/fitbit_steps_intraday_parsed_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -507,7 +507,7 @@ rule fitbit_steps_intraday_r_features: rule fitbit_sleep_summary_python_features: input: sensor_data = "data/raw/{pid}/fitbit_sleep_summary_parsed_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", @@ -520,7 +520,7 @@ rule fitbit_sleep_summary_python_features: rule fitbit_sleep_summary_r_features: input: sensor_data = "data/raw/{pid}/fitbit_sleep_summary_parsed_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", diff --git a/rules/models.smk b/rules/models.smk index acca3b72..1b09663b 100644 --- a/rules/models.smk +++ b/rules/models.smk @@ -34,11 +34,11 @@ rule download_target_data: rule target_readable_datetime: input: sensor_input = "data/raw/{pid}/participant_target_raw.csv", - day_segments = "data/interim/day_segments/{pid}_day_segments.csv" + time_segments = "data/interim/time_segments/{pid}_time_segments.csv" params: fixed_timezone = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["SOURCE"]["TIMEZONE"], - day_segments_type = config["DAY_SEGMENTS"]["TYPE"], - include_past_periodic_segments = config["DAY_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"] + time_segments_type = config["TIME_SEGMENTS"]["TYPE"], + include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"] output: "data/raw/{pid}/participant_target_with_datetime.csv" script: @@ -47,7 +47,7 @@ rule target_readable_datetime: rule parse_targets: input: targets = "data/raw/{pid}/participant_target_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" output: "data/processed/targets/{pid}/parsed_targets.csv" script: diff --git 
a/rules/preprocessing.smk b/rules/preprocessing.smk index 7b03b18e..82c65ede 100644 --- a/rules/preprocessing.smk +++ b/rules/preprocessing.smk @@ -50,28 +50,28 @@ rule download_fitbit_data: script: "../src/data/download_fitbit_data.R" -rule compute_day_segments: +rule compute_time_segments: input: - config["DAY_SEGMENTS"]["FILE"], + config["TIME_SEGMENTS"]["FILE"], "data/external/participant_files/{pid}.yaml" params: - day_segments_type = config["DAY_SEGMENTS"]["TYPE"], + time_segments_type = config["TIME_SEGMENTS"]["TYPE"], pid = "{pid}" output: - segments_file = "data/interim/day_segments/{pid}_day_segments.csv", - segments_labels_file = "data/interim/day_segments/{pid}_day_segments_labels.csv", + segments_file = "data/interim/time_segments/{pid}_time_segments.csv", + segments_labels_file = "data/interim/time_segments/{pid}_time_segments_labels.csv", script: - "../src/data/compute_day_segments.py" + "../src/data/compute_time_segments.py" rule phone_readable_datetime: input: sensor_input = "data/raw/{pid}/phone_{sensor}_raw.csv", - day_segments = "data/interim/day_segments/{pid}_day_segments.csv" + time_segments = "data/interim/time_segments/{pid}_time_segments.csv" params: timezones = config["PHONE_DATA_CONFIGURATION"]["TIMEZONE"]["TYPE"], fixed_timezone = config["PHONE_DATA_CONFIGURATION"]["TIMEZONE"]["VALUE"], - day_segments_type = config["DAY_SEGMENTS"]["TYPE"], - include_past_periodic_segments = config["DAY_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"] + time_segments_type = config["TIME_SEGMENTS"]["TYPE"], + include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"] output: "data/raw/{pid}/phone_{sensor}_with_datetime.csv" script: @@ -90,12 +90,12 @@ rule phone_yielded_timestamps: rule phone_yielded_timestamps_with_datetime: input: sensor_input = "data/interim/{pid}/phone_yielded_timestamps.csv", - day_segments = "data/interim/day_segments/{pid}_day_segments.csv" + time_segments = "data/interim/time_segments/{pid}_time_segments.csv" params: timezones = config["PHONE_DATA_CONFIGURATION"]["TIMEZONE"]["TYPE"], fixed_timezone = config["PHONE_DATA_CONFIGURATION"]["TIMEZONE"]["VALUE"], - day_segments_type = config["DAY_SEGMENTS"]["TYPE"], - include_past_periodic_segments = config["DAY_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"] + time_segments_type = config["TIME_SEGMENTS"]["TYPE"], + include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"] output: "data/interim/{pid}/phone_yielded_timestamps_with_datetime.csv" script: @@ -128,12 +128,12 @@ rule process_phone_locations_types: rule phone_locations_processed_with_datetime: input: sensor_input = "data/interim/{pid}/phone_locations_processed.csv", - day_segments = "data/interim/day_segments/{pid}_day_segments.csv" + time_segments = "data/interim/time_segments/{pid}_time_segments.csv" params: timezones = config["PHONE_DATA_CONFIGURATION"]["TIMEZONE"]["TYPE"], fixed_timezone = config["PHONE_DATA_CONFIGURATION"]["TIMEZONE"]["VALUE"], - day_segments_type = config["DAY_SEGMENTS"]["TYPE"], - include_past_periodic_segments = config["DAY_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"] + time_segments_type = config["TIME_SEGMENTS"]["TYPE"], + include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"] output: "data/interim/{pid}/phone_locations_processed_with_datetime.csv" script: @@ -150,12 +150,12 @@ rule resample_episodes: rule resample_episodes_with_datetime: input: sensor_input = "data/interim/{pid}/{sensor}_episodes_resampled.csv", - day_segments = 
"data/interim/day_segments/{pid}_day_segments.csv" + time_segments = "data/interim/time_segments/{pid}_time_segments.csv" params: timezones = config["PHONE_DATA_CONFIGURATION"]["TIMEZONE"]["TYPE"], fixed_timezone = config["PHONE_DATA_CONFIGURATION"]["TIMEZONE"]["VALUE"], - day_segments_type = config["DAY_SEGMENTS"]["TYPE"], - include_past_periodic_segments = config["DAY_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"] + time_segments_type = config["TIME_SEGMENTS"]["TYPE"], + include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"] output: "data/interim/{pid}/{sensor}_episodes_resampled_with_datetime.csv" script: @@ -233,11 +233,11 @@ rule fitbit_parse_sleep: rule fitbit_readable_datetime: input: sensor_input = "data/raw/{pid}/fitbit_{sensor}_{fitbit_data_type}_parsed.csv", - day_segments = "data/interim/day_segments/{pid}_day_segments.csv" + time_segments = "data/interim/time_segments/{pid}_time_segments.csv" params: fixed_timezone = config["FITBIT_DATA_CONFIGURATION"]["TIMEZONE"]["VALUE"], - day_segments_type = config["DAY_SEGMENTS"]["TYPE"], - include_past_periodic_segments = config["DAY_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"] + time_segments_type = config["TIME_SEGMENTS"]["TYPE"], + include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"] output: "data/raw/{pid}/fitbit_{sensor}_{fitbit_data_type}_parsed_with_datetime.csv" script: diff --git a/rules/reports.smk b/rules/reports.smk index e18e9ddc..6071db09 100644 --- a/rules/reports.smk +++ b/rules/reports.smk @@ -11,7 +11,7 @@ rule histogram_phone_data_yield: rule heatmap_features_correlations: input: - features = expand("data/processed/{pid}/{sensor}_{day_segment}.csv", pid=config["PIDS"], sensor=config["HEATMAP_FEATURES_CORRELATIONS"]["PHONE_FEATURES"]+config["HEATMAP_FEATURES_CORRELATIONS"]["FITBIT_FEATURES"], day_segment=config["DAY_SEGMENTS"]), + features = expand("data/processed/{pid}/{sensor}_{time_segment}.csv", pid=config["PIDS"], sensor=config["HEATMAP_FEATURES_CORRELATIONS"]["PHONE_FEATURES"]+config["HEATMAP_FEATURES_CORRELATIONS"]["FITBIT_FEATURES"], time_segment=config["TIME_SEGMENTS"]), phone_valid_sensed_days = expand("data/interim/{pid}/phone_valid_sensed_days_{{min_valid_hours_per_day}}hours_{{min_valid_bins_per_hour}}bins.csv", pid=config["PIDS"]) params: min_rows_ratio = config["HEATMAP_FEATURES_CORRELATIONS"]["MIN_ROWS_RATIO"], diff --git a/sn_profile_rapids/Snakefile b/sn_profile_rapids/Snakefile index b025be6b..18a469a5 100644 --- a/sn_profile_rapids/Snakefile +++ b/sn_profile_rapids/Snakefile @@ -32,13 +32,13 @@ if config["PHONE_VALID_SENSED_DAYS"]["COMPUTE"]: if config["MESSAGES"]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["MESSAGES"]["DB_TABLE"])) files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["MESSAGES"]["DB_TABLE"])) - files_to_compute.extend(expand("data/processed/{pid}/messages_{messages_type}_{day_segment}.csv", pid=config["PIDS"], messages_type = config["MESSAGES"]["TYPES"], day_segment = config["MESSAGES"]["DAY_SEGMENTS"])) + files_to_compute.extend(expand("data/processed/{pid}/messages_{messages_type}_{time_segment}.csv", pid=config["PIDS"], messages_type = config["MESSAGES"]["TYPES"], time_segment = config["MESSAGES"]["TIME_SEGMENTS"])) if config["CALLS"]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"])) 
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"])) files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"])) - files_to_compute.extend(expand("data/processed/{pid}/calls_{call_type}_{day_segment}.csv", pid=config["PIDS"], call_type=config["CALLS"]["TYPES"], day_segment = config["CALLS"]["DAY_SEGMENTS"])) + files_to_compute.extend(expand("data/processed/{pid}/calls_{call_type}_{time_segment}.csv", pid=config["PIDS"], call_type=config["CALLS"]["TYPES"], time_segment = config["CALLS"]["TIME_SEGMENTS"])) if config["BARNETT_LOCATION"]["COMPUTE"]: # TODO add files_to_compute.extend(optional_location_input(None)) @@ -49,24 +49,24 @@ if config["BARNETT_LOCATION"]["COMPUTE"]: raise ValueError("Error: Add your locations table (and as many sensor tables as you have) to [PHONE_VALID_SENSED_BINS][TABLES] in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data) which is used to resample fused location data (RESAMPLED_FUSED)") files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["BARNETT_LOCATION"]["DB_TABLE"])) files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["BARNETT_LOCATION"]["DB_TABLE"])) - files_to_compute.extend(expand("data/processed/{pid}/location_barnett_{day_segment}.csv", pid=config["PIDS"], day_segment = config["BARNETT_LOCATION"]["DAY_SEGMENTS"])) + files_to_compute.extend(expand("data/processed/{pid}/location_barnett_{time_segment}.csv", pid=config["PIDS"], time_segment = config["BARNETT_LOCATION"]["TIME_SEGMENTS"])) if config["BLUETOOTH"]["COMPUTE"]: - files_to_compute.extend(expand("data/interim/{sensor}_day_segments.csv", sensor=config["BLUETOOTH"]["DB_TABLE"])) + files_to_compute.extend(expand("data/interim/{sensor}_time_segments.csv", sensor=config["BLUETOOTH"]["DB_TABLE"])) files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["BLUETOOTH"]["DB_TABLE"])) files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["BLUETOOTH"]["DB_TABLE"])) files_to_compute.extend(expand("data/processed/{pid}/bluetooth_features.csv", pid=config["PIDS"] )) if config["ACTIVITY_RECOGNITION"]["COMPUTE"]: # TODO add files_to_compute.extend(optional_ar_input(None)), the Android or iOS table gets processed depending on each participant - files_to_compute.extend(expand("data/processed/{pid}/activity_recognition_{day_segment}.csv",pid=config["PIDS"], day_segment = config["ACTIVITY_RECOGNITION"]["DAY_SEGMENTS"])) + files_to_compute.extend(expand("data/processed/{pid}/activity_recognition_{time_segment}.csv",pid=config["PIDS"], time_segment = config["ACTIVITY_RECOGNITION"]["TIME_SEGMENTS"])) if config["BATTERY"]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["BATTERY"]["DB_TABLE"])) files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["BATTERY"]["DB_TABLE"])) files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=config["PIDS"], sensor=config["BATTERY"]["DB_TABLE"])) files_to_compute.extend(expand("data/processed/{pid}/battery_deltas.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/processed/{pid}/battery_{day_segment}.csv", 
pid = config["PIDS"], day_segment = config["BATTERY"]["DAY_SEGMENTS"])) + files_to_compute.extend(expand("data/processed/{pid}/battery_{time_segment}.csv", pid = config["PIDS"], time_segment = config["BATTERY"]["TIME_SEGMENTS"])) if config["SCREEN"]["COMPUTE"]: if config["SCREEN"]["DB_TABLE"] in config["PHONE_VALID_SENSED_BINS"]["TABLES"]: @@ -77,50 +77,50 @@ if config["SCREEN"]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["SCREEN"]["DB_TABLE"])) files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=config["PIDS"], sensor=config["SCREEN"]["DB_TABLE"])) files_to_compute.extend(expand("data/processed/{pid}/screen_deltas.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/processed/{pid}/screen_{day_segment}.csv", pid = config["PIDS"], day_segment = config["SCREEN"]["DAY_SEGMENTS"])) + files_to_compute.extend(expand("data/processed/{pid}/screen_{time_segment}.csv", pid = config["PIDS"], time_segment = config["SCREEN"]["TIME_SEGMENTS"])) if config["LIGHT"]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["LIGHT"]["DB_TABLE"])) files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["LIGHT"]["DB_TABLE"])) - files_to_compute.extend(expand("data/processed/{pid}/light_{day_segment}.csv", pid = config["PIDS"], day_segment = config["LIGHT"]["DAY_SEGMENTS"])) + files_to_compute.extend(expand("data/processed/{pid}/light_{time_segment}.csv", pid = config["PIDS"], time_segment = config["LIGHT"]["TIME_SEGMENTS"])) if config["ACCELEROMETER"]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["ACCELEROMETER"]["DB_TABLE"])) files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["ACCELEROMETER"]["DB_TABLE"])) - files_to_compute.extend(expand("data/processed/{pid}/accelerometer_{day_segment}.csv", pid = config["PIDS"], day_segment = config["ACCELEROMETER"]["DAY_SEGMENTS"])) + files_to_compute.extend(expand("data/processed/{pid}/accelerometer_{time_segment}.csv", pid = config["PIDS"], time_segment = config["ACCELEROMETER"]["TIME_SEGMENTS"])) if config["APPLICATIONS_FOREGROUND"]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])) files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])) files_to_compute.extend(expand("data/interim/{pid}/{sensor}_with_datetime_with_genre.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])) - files_to_compute.extend(expand("data/processed/{pid}/applications_foreground_{day_segment}.csv", pid = config["PIDS"], day_segment = config["APPLICATIONS_FOREGROUND"]["DAY_SEGMENTS"])) + files_to_compute.extend(expand("data/processed/{pid}/applications_foreground_{time_segment}.csv", pid = config["PIDS"], time_segment = config["APPLICATIONS_FOREGROUND"]["TIME_SEGMENTS"])) if config["WIFI"]["COMPUTE"]: - files_to_compute.extend(expand("data/interim/{sensor}_day_segments.csv", sensor=config["WIFI"]["DB_TABLE"])) + files_to_compute.extend(expand("data/interim/{sensor}_time_segments.csv", sensor=config["WIFI"]["DB_TABLE"])) files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], 
sensor=config["WIFI"]["DB_TABLE"])) files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["WIFI"]["DB_TABLE"])) - files_to_compute.extend(expand("data/processed/{pid}/wifi_features.csv", pid = config["PIDS"], day_segment = config["WIFI"]["DAY_SEGMENTS"])) + files_to_compute.extend(expand("data/processed/{pid}/wifi_features.csv", pid = config["PIDS"], time_segment = config["WIFI"]["TIME_SEGMENTS"])) if config["HEARTRATE"]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["HEARTRATE"]["DB_TABLE"])) files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_{fitbit_data_type}_with_datetime.csv", pid=config["PIDS"], fitbit_data_type=["summary", "intraday"])) - files_to_compute.extend(expand("data/processed/{pid}/fitbit_heartrate_{day_segment}.csv", pid = config["PIDS"], day_segment = config["HEARTRATE"]["DAY_SEGMENTS"])) + files_to_compute.extend(expand("data/processed/{pid}/fitbit_heartrate_{time_segment}.csv", pid = config["PIDS"], time_segment = config["HEARTRATE"]["TIME_SEGMENTS"])) if config["STEP"]["COMPUTE"]: if config["STEP"]["EXCLUDE_SLEEP"]["EXCLUDE"] == True and config["STEP"]["EXCLUDE_SLEEP"]["TYPE"] == "FITBIT_BASED": files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_{fitbit_data_type}_with_datetime.csv", pid=config["PIDS"], fitbit_data_type=["summary"])) files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["STEP"]["DB_TABLE"])) files_to_compute.extend(expand("data/raw/{pid}/fitbit_step_{fitbit_data_type}_with_datetime.csv", pid=config["PIDS"], fitbit_data_type=["intraday"])) - files_to_compute.extend(expand("data/processed/{pid}/fitbit_step_{day_segment}.csv", pid = config["PIDS"], day_segment = config["STEP"]["DAY_SEGMENTS"])) + files_to_compute.extend(expand("data/processed/{pid}/fitbit_step_{time_segment}.csv", pid = config["PIDS"], time_segment = config["STEP"]["TIME_SEGMENTS"])) if config["SLEEP"]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["SLEEP"]["DB_TABLE"])) files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_{fitbit_data_type}_with_datetime.csv", pid=config["PIDS"], fitbit_data_type=["intraday", "summary"])) - files_to_compute.extend(expand("data/processed/{pid}/fitbit_sleep_{day_segment}.csv", pid = config["PIDS"], day_segment = config["SLEEP"]["DAY_SEGMENTS"])) + files_to_compute.extend(expand("data/processed/{pid}/fitbit_sleep_{time_segment}.csv", pid = config["PIDS"], time_segment = config["SLEEP"]["TIME_SEGMENTS"])) if config["CONVERSATION"]["COMPUTE"]: # TODO add files_to_compute.extend(optional_conversation_input(None)), the Android or iOS table gets processed depending on each participant - files_to_compute.extend(expand("data/processed/{pid}/conversation_{day_segment}.csv",pid=config["PIDS"], day_segment = config["CONVERSATION"]["DAY_SEGMENTS"])) + files_to_compute.extend(expand("data/processed/{pid}/conversation_{time_segment}.csv",pid=config["PIDS"], time_segment = config["CONVERSATION"]["TIME_SEGMENTS"])) if config["DORYAB_LOCATION"]["COMPUTE"]: if config["DORYAB_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED": @@ -130,7 +130,7 @@ if config["DORYAB_LOCATION"]["COMPUTE"]: raise ValueError("Error: Add your locations table (and as many sensor tables as you have) to [PHONE_VALID_SENSED_BINS][TABLES] in config.yaml. 
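An aside for readers skimming this Snakefile: every target list above is built with Snakemake's expand(), which takes a path template and yields the cartesian product of the wildcard values. A minimal, self-contained sketch of that mechanism follows; the config values in it are invented placeholders, not this pipeline's real settings.

    # Sketch of how expand() enumerates the processed-feature targets above.
    # PIDS and TIME_SEGMENTS values are hypothetical examples.
    from snakemake.io import expand

    config = {"PIDS": ["p01", "p02"], "SCREEN": {"TIME_SEGMENTS": ["daily", "morning"]}}
    targets = expand("data/processed/{pid}/screen_{time_segment}.csv",
                     pid=config["PIDS"],
                     time_segment=config["SCREEN"]["TIME_SEGMENTS"])
    # -> ['data/processed/p01/screen_daily.csv', 'data/processed/p01/screen_morning.csv',
    #     'data/processed/p02/screen_daily.csv', 'data/processed/p02/screen_morning.csv']
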
This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data) which is used to resample fused location data (RESAMPLED_FUSED)") files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["DORYAB_LOCATION"]["DB_TABLE"])) files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["DORYAB_LOCATION"]["DB_TABLE"])) - files_to_compute.extend(expand("data/processed/{pid}/location_doryab_{segment}.csv", pid=config["PIDS"], segment = config["DORYAB_LOCATION"]["DAY_SEGMENTS"])) + files_to_compute.extend(expand("data/processed/{pid}/location_doryab_{segment}.csv", pid=config["PIDS"], segment = config["DORYAB_LOCATION"]["TIME_SEGMENTS"])) if config["PARAMS_FOR_ANALYSIS"]["COMPUTE"]: rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"] @@ -143,31 +143,31 @@ if config["PARAMS_FOR_ANALYSIS"]["COMPUTE"]: cols_nan_thresholds = cols_nan_thresholds + list(itertools.chain.from_iterable([threshold] * len(config["PARAMS_FOR_ANALYSIS"]["MODEL_SCALER"][model_name]) for threshold in cols_nan_threshold)) results = config["PARAMS_FOR_ANALYSIS"]["RESULT_COMPONENTS"] + ["merged_population_model_results"] - files_to_compute.extend(expand("data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_original.csv", + files_to_compute.extend(expand("data/processed/{pid}/data_for_individual_model/{source}_{time_segment}_original.csv", pid = config["PIDS"], source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"], - day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"])) - files_to_compute.extend(expand("data/processed/data_for_population_model/{source}_{day_segment}_original.csv", + time_segment = config["PARAMS_FOR_ANALYSIS"]["TIME_SEGMENTS"])) + files_to_compute.extend(expand("data/processed/data_for_population_model/{source}_{time_segment}_original.csv", source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"], - day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"])) + time_segment = config["PARAMS_FOR_ANALYSIS"]["TIME_SEGMENTS"])) files_to_compute.extend(expand( - expand("data/processed/{pid}/data_for_individual_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv", + expand("data/processed/{pid}/data_for_individual_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{time_segment}_clean.csv", pid = config["PIDS"], days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"], days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"], cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"], - day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]), + time_segment = config["PARAMS_FOR_ANALYSIS"]["TIME_SEGMENTS"]), zip, rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"])) files_to_compute.extend(expand( - expand("data/processed/data_for_population_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv", + 
expand("data/processed/data_for_population_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{time_segment}_clean.csv", days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"], days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"], cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"], - day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]), + time_segment = config["PARAMS_FOR_ANALYSIS"]["TIME_SEGMENTS"]), zip, rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"])) @@ -175,46 +175,46 @@ if config["PARAMS_FOR_ANALYSIS"]["COMPUTE"]: files_to_compute.extend(expand("data/processed/data_for_population_model/targets_{summarised}.csv", summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"])) files_to_compute.extend(expand( - expand("data/processed/data_for_population_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv", + expand("data/processed/data_for_population_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{time_segment}_nancellsratio.csv", days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"], days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"], cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"], - day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]), + time_segment = config["PARAMS_FOR_ANALYSIS"]["TIME_SEGMENTS"]), zip, rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"])) files_to_compute.extend(expand( - expand("data/processed/data_for_population_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_{summarised}.csv", + expand("data/processed/data_for_population_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{time_segment}_{summarised}.csv", days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"], days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"], cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"], - day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"], + time_segment = config["PARAMS_FOR_ANALYSIS"]["TIME_SEGMENTS"], summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"]), zip, rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"])) files_to_compute.extend(expand( - expand("data/processed/output_population_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_{summarised}_{cv_method}_baseline.csv", + 
expand("data/processed/output_population_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{time_segment}_{summarised}_{cv_method}_baseline.csv", days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"], days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"], cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], cv_method = config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"], source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"], - day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"], + time_segment = config["PARAMS_FOR_ANALYSIS"]["TIME_SEGMENTS"], summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"]), zip, rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"])) files_to_compute.extend(expand( - expand("data/processed/output_population_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{{model}}/{cv_method}/{source}_{day_segment}_{summarised}_{{scaler}}/{result}.csv", + expand("data/processed/output_population_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{{model}}/{cv_method}/{source}_{time_segment}_{summarised}_{{scaler}}/{result}.csv", days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"], days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"], cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], cv_method = config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"], source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"], - day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"], + time_segment = config["PARAMS_FOR_ANALYSIS"]["TIME_SEGMENTS"], summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"], result = results), zip, diff --git a/sn_profile_rapids/config.yaml b/sn_profile_rapids/config.yaml index 29ee17ae..a68742c2 100644 --- a/sn_profile_rapids/config.yaml +++ b/sn_profile_rapids/config.yaml @@ -2,4 +2,4 @@ configfile: ./sn_profile_rapids/pipeline_config.yaml directory: ./ snakefile: ./sn_profile_rapids/Snakefile cores: 1 -# forcerun: compute_day_segments \ No newline at end of file +# forcerun: compute_time_segments \ No newline at end of file diff --git a/sn_profile_rapids/pipeline_config.yaml b/sn_profile_rapids/pipeline_config.yaml index cf070db6..cbacf8e2 100644 --- a/sn_profile_rapids/pipeline_config.yaml +++ b/sn_profile_rapids/pipeline_config.yaml @@ -3,6 +3,6 @@ DOWNLOAD_DATASET: GROUP: RAPIDS BLUETOOTH: COMPUTE: True - DAY_SEGMENTS: "data/external/daysegments_bluetooth.csv" + TIME_SEGMENTS: "data/external/timesegments_bluetooth.csv" WIFI: COMPUTE: True \ No newline at end of file diff --git a/src/data/assign_to_day_segment.R b/src/data/assign_to_time_segment.R similarity index 84% rename from src/data/assign_to_day_segment.R rename to src/data/assign_to_time_segment.R index 18dab2df..43df1f12 100644 --- a/src/data/assign_to_day_segment.R +++ b/src/data/assign_to_time_segment.R @@ -3,7 +3,7 @@ library("lubridate", warn.conflicts = F) options(scipen=999) day_type_delay <- function(day_type, include_past_periodic_segments){ - delay <- day_segments %>% mutate(length_duration = duration(length)) %>% filter(repeats_on == day_type) %>% arrange(-length_duration) %>% pull(length_duration) %>% first() + delay 
<- time_segments %>% mutate(length_duration = duration(length)) %>% filter(repeats_on == day_type) %>% arrange(-length_duration) %>% pull(length_duration) %>% first() return(if_else(is.na(delay) | include_past_periodic_segments == FALSE, duration("0days"), delay)) } @@ -27,10 +27,10 @@ get_segment_dates <- function(data, local_timezone, day_type, delay){ return(dates) } -assign_rows_to_segments <- function(nested_data, nested_inferred_day_segments){ +assign_rows_to_segments <- function(nested_data, nested_inferred_time_segments){ nested_data <- nested_data %>% mutate(assigned_segments = "") - for(i in 1:nrow(nested_inferred_day_segments)) { - segment <- nested_inferred_day_segments[i,] + for(i in 1:nrow(nested_inferred_time_segments)) { + segment <- nested_inferred_time_segments[i,] nested_data$assigned_segments <- ifelse(segment$segment_start_ts<= nested_data$timestamp & segment$segment_end_ts >= nested_data$timestamp, stringi::stri_c(nested_data$assigned_segments, segment$segment_id, sep = "|"), nested_data$assigned_segments) } @@ -38,9 +38,9 @@ assign_rows_to_segments <- function(nested_data, nested_inferred_day_segments){ return(nested_data) } -assign_rows_to_segments_frequency <- function(nested_data, nested_timezone, day_segments){ - for(i in 1:nrow(day_segments)) { - segment <- day_segments[i,] +assign_rows_to_segments_frequency <- function(nested_data, nested_timezone, time_segments){ + for(i in 1:nrow(time_segments)) { + segment <- time_segments[i,] nested_data$assigned_segments <- ifelse(segment$segment_start_ts<= nested_data$local_time_obj & segment$segment_end_ts >= nested_data$local_time_obj, # The segment_id is assembled on the fly because it depends on each row's local_date and timezone stringi::stri_c("[", @@ -57,14 +57,14 @@ assign_rows_to_segments_frequency <- function(nested_data, nested_timezone, day_ return(nested_data) } -assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type, include_past_periodic_segments){ +assign_to_time_segment <- function(sensor_data, time_segments, time_segments_type, include_past_periodic_segments){ - if(nrow(sensor_data) == 0 || nrow(day_segments) == 0) + if(nrow(sensor_data) == 0 || nrow(time_segments) == 0) return(sensor_data %>% mutate(assigned_segments = NA)) - if(day_segments_type == "FREQUENCY"){ + if(time_segments_type == "FREQUENCY"){ - day_segments <- day_segments %>% mutate(start_time = lubridate::hm(start_time), + time_segments <- time_segments %>% mutate(start_time = lubridate::hm(start_time), end_time = start_time + minutes(length) - seconds(1), segment_id_start_time = paste(str_pad(hour(start_time),2, pad="0"), str_pad(minute(start_time),2, pad="0"), str_pad(second(start_time),2, pad="0"),sep =":"), segment_id_end_time = paste(str_pad(hour(ymd("1970-01-01") + end_time),2, pad="0"), str_pad(minute(ymd("1970-01-01") + end_time),2, pad="0"), str_pad(second(ymd("1970-01-01") + end_time),2, pad="0"),sep =":"), # add ymd("1970-01-01") to get a real time instead of duration @@ -77,7 +77,7 @@ assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type, sensor_data <- sensor_data %>% group_by(local_timezone) %>% nest() %>% - mutate(data = map2(data, local_timezone, assign_rows_to_segments_frequency, day_segments)) %>% + mutate(data = map2(data, local_timezone, assign_rows_to_segments_frequency, time_segments)) %>% unnest(cols = data) %>% arrange(timestamp) %>% select(-local_time_obj) @@ -85,10 +85,10 @@ assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type,
return(sensor_data) - } else if (day_segments_type == "PERIODIC"){ + } else if (time_segments_type == "PERIODIC"){ # We need to take into account segment start dates that could include the first day of data - day_segments <- day_segments %>% mutate(length_duration = duration(length)) + time_segments <- time_segments %>% mutate(length_duration = duration(length)) every_day_delay <- duration("0days") wday_delay <- day_type_delay("wday", include_past_periodic_segments) mday_delay <- day_type_delay("mday", include_past_periodic_segments) @@ -106,9 +106,9 @@ assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type, year_dates = map2(data, local_timezone, get_segment_dates, "yday", yday_delay), existent_dates = pmap(list(every_date, week_dates, month_dates, quarter_dates, year_dates), function(every_date, week_dates, month_dates, quarter_dates, year_dates) reduce(list(every_date, week_dates,month_dates, quarter_dates, year_dates), .f=full_join)), - # build the actual day segments taking into account the users requested length and repeat schedule - inferred_day_segments = map(existent_dates, - ~ crossing(day_segments, .x) %>% + # build the actual time segments taking into account the user's requested length and repeat schedule + inferred_time_segments = map(existent_dates, + ~ crossing(time_segments, .x) %>% pivot_longer(cols = c(every_day,wday, mday, qday, yday), names_to = "day_type", values_to = "day_value") %>% filter(repeats_on == day_type & repeats_value == day_value) %>% # The segment ids (segment_id_start and segment_id_end) are computed in UTC to avoid having different labels for instances of a segment that happen in different timezones @@ -125,21 +125,21 @@ assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type, paste(str_pad(hour(segment_id_end),2, pad="0"), str_pad(minute(segment_id_end),2, pad="0"), str_pad(second(segment_id_end),2, pad="0"),sep =":")),";", paste0(segment_start_ts, ",", segment_end_ts)), "]")) %>% - # drop day segments with an invalid start or end time (mostly due to daylight saving changes, e.g.
2020-03-08 02:00:00 EST does not exist, clock jumps from 01:59am to 03:00am) drop_na(segment_start_ts, segment_end_ts)), - data = map2(data, inferred_day_segments, assign_rows_to_segments) + data = map2(data, inferred_time_segments, assign_rows_to_segments) ) %>% - select(-existent_dates, -inferred_day_segments, -every_date, -week_dates, -month_dates, -quarter_dates, -year_dates) %>% + select(-existent_dates, -inferred_time_segments, -every_date, -week_dates, -month_dates, -quarter_dates, -year_dates) %>% unnest(cols = data) %>% arrange(timestamp) - } else if ( day_segments_type == "EVENT"){ + } else if ( time_segments_type == "EVENT"){ sensor_data <- sensor_data %>% group_by(local_timezone) %>% nest() %>% - mutate(inferred_day_segments = map(local_timezone, function(tz){ - inferred <- day_segments %>% + mutate(inferred_time_segments = map(local_timezone, function(tz){ + inferred <- time_segments %>% mutate(shift = ifelse(shift == "0", "0seconds", shift), segment_start_ts = event_timestamp + (as.integer(seconds(lubridate::duration(shift))) * ifelse(shift_direction >= 0, 1, -1) * 1000), segment_end_ts = segment_start_ts + (as.integer(seconds(lubridate::duration(length))) * 1000), @@ -161,12 +161,12 @@ assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type, overlapping_segments = paste(paste(lag(label), lag(event_timestamp), lag(length), lag(shift), lag(shift_direction), lag(device_id), sep = ","),"and", paste(label, event_timestamp, length, shift, shift_direction, device_id, sep = ","))) if(any(overlapping$overlaps, na.rm = TRUE)){ - stop(paste0("\n\nOne or more event day segments overlap for ",overlapping$device_id[[1]],", modify their lengths so they don't:\n", paste0(overlapping %>% filter(overlaps == TRUE) %>% pull(overlapping_segments), collapse = "\n"), "\n\n")) + stop(paste0("\n\nOne or more event time segments overlap for ",overlapping$device_id[[1]],", modify their lengths so they don't:\n", paste0(overlapping %>% filter(overlaps == TRUE) %>% pull(overlapping_segments), collapse = "\n"), "\n\n")) } else{ return(inferred) }}), - data = map2(data, inferred_day_segments, assign_rows_to_segments)) %>% - select(-inferred_day_segments) %>% + data = map2(data, inferred_time_segments, assign_rows_to_segments)) %>% + select(-inferred_time_segments) %>% unnest(data) %>% arrange(timestamp) } diff --git a/src/data/compute_day_segments.py b/src/data/compute_day_segments.py deleted file mode 100644 index 8705003c..00000000 --- a/src/data/compute_day_segments.py +++ /dev/null @@ -1,216 +0,0 @@ -import pandas as pd -import warnings -import yaml - -def is_valid_frequency_segments(day_segments, day_segments_file): - """ - returns true if day_segment has the expected structure for generating frequency segments; - raises ValueError exception otherwise. - """ - - valid_columns = ["label", "length"] - if set(day_segments.columns) != set(valid_columns): - error_message = 'The FREQUENCY day segments file in [DAY_SEGMENTS][FILE] must have two columns: label, and length ' \ - 'but instead we found {}. Modify {}'.format(list(day_segments.columns), day_segments_file) - raise ValueError(error_message) - - if day_segments.shape[0] > 1: - message = 'The FREQUENCY day segments file in [DAY_SEGMENTS][FILE] can only have 1 row.' 
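For context while reading these validators: a FREQUENCY segments file is a single-row CSV with a label and a bin length in minutes. A hypothetical example (not one of the CSVs shipped in data/external/) and the checks it must pass:

    # Hypothetical FREQUENCY segments file: label,length (length in minutes, < 1440).
    import pandas as pd
    from io import StringIO

    time_segments = pd.read_csv(StringIO("label,length\nthirtyminutes,30\n"))
    assert set(time_segments.columns) == {"label", "length"}
    assert time_segments.shape[0] == 1
    assert 0 <= time_segments.iloc[0].loc["length"] < 1440
    # parse_frequency_segments() would expand this single row into 48 half-hour bins:
    # start_time,length,label -> 00:00,30,thirtyminutes0000 ... 23:30,30,thirtyminutes0047
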
\ - 'Modify {}'.format(day_segments_file) - raise ValueError(message) - - if not pd.api.types.is_integer_dtype(day_segments.dtypes['length']): - message = 'The column length in the FREQUENCY day segments file in [DAY_SEGMENTS][FILE] must be integer but instead is ' \ - '{}. . This usually means that not all values in this column are formed by digits. Modify {}'.format(day_segments.dtypes['length'], day_segments_file) - raise ValueError(message) - - if day_segments.iloc[0].loc['length'] < 0: - message = 'The value in column length in the FREQUENCY day segments file in [DAY_SEGMENTS][FILE] must be positive but instead is ' \ - '{}. Modify {}'.format(day_segments.iloc[0].loc['length'], day_segments_file) - raise ValueError(message) - if day_segments.iloc[0].loc['length'] >= 1440: - message = 'The column length in the FREQUENCY day segments file in [DAY_SEGMENTS][FILE] must be shorter than a day in minutes (1440) but instead is ' \ - '{}. Modify {}'.format(day_segments.iloc[0].loc['length'], day_segments_file) - raise ValueError(message) - - return True - -def is_valid_periodic_segments(day_segments, day_segments_file): - day_segments = day_segments.copy(deep=True) - - valid_columns = ["label", "start_time", "length", "repeats_on", "repeats_value"] - if set(day_segments.columns) != set(valid_columns): - error_message = 'The PERIODIC day segments file in [DAY_SEGMENTS][FILE] must have five columns: label, start_time, length, repeats_on, repeats_value ' \ - 'but instead we found {}. Modify {}'.format(list(day_segments.columns), day_segments_file) - raise ValueError(error_message) - - valid_repeats_on = ["every_day", "wday", "mday", "qday", "yday"] - if len(list(set(day_segments["repeats_on"]) - set(valid_repeats_on))) > 0: - error_message = 'The column repeats_on in the PERIODIC day segments file in [DAY_SEGMENTS][FILE] can only accept: "every_day", "wday", "mday", "qday", or "yday" ' \ - 'but instead we found {}. Modify {}'.format(list(set(day_segments["repeats_on"])), day_segments_file) - raise ValueError(error_message) - - if not pd.api.types.is_integer_dtype(day_segments.dtypes['repeats_value']): - message = 'The column repeats_value in the PERIODIC day segments file in [DAY_SEGMENTS][FILE] must be integer but instead is ' \ - '{}. . This usually means that not all values in this column are formed by digits. Modify {}'.format(day_segments.dtypes['repeats_value'], day_segments_file) - raise ValueError(message) - - invalid_day_segments = day_segments.query("repeats_on == 'every_day' and repeats_value != 0") - if invalid_day_segments.shape[0] > 0: - message = 'Every row with repeats_on=every_day must have a repeats_value=0 in the PERIODIC day segments file in [DAY_SEGMENTS][FILE].' \ - ' Modify row(s) of segment(s) {} of {}'.format(invalid_day_segments["label"].to_numpy(), day_segments_file) - raise ValueError(message) - - invalid_day_segments = day_segments.query("repeats_on == 'wday' and (repeats_value < 1 | repeats_value > 7)") - if invalid_day_segments.shape[0] > 0: - message = 'Every row with repeats_on=wday must have a repeats_value=[1,7] in the PERIODIC day segments file in [DAY_SEGMENTS][FILE].' 
\ - ' Modify row(s) of segment(s) {} of {}'.format(invalid_day_segments["label"].to_numpy(), day_segments_file) - raise ValueError(message) - - invalid_day_segments = day_segments.query("repeats_on == 'mday' and (repeats_value < 1 | repeats_value > 31)") - if invalid_day_segments.shape[0] > 0: - message = 'Every row with repeats_on=mday must have a repeats_value=[1,31] in the PERIODIC day segments file in [DAY_SEGMENTS][FILE].' \ - ' Modify row(s) of segment(s) {} of {}'.format(invalid_day_segments["label"].to_numpy(), day_segments_file) - raise ValueError(message) - - invalid_day_segments = day_segments.query("repeats_on == 'qday' and (repeats_value < 1 | repeats_value > 92)") - if invalid_day_segments.shape[0] > 0: - message = 'Every row with repeats_on=qday must have a repeats_value=[1,92] in the PERIODIC day segments file in [DAY_SEGMENTS][FILE].' \ - ' Modify row(s) of segment(s) {} of {}'.format(invalid_day_segments["label"].to_numpy(), day_segments_file) - raise ValueError(message) - - invalid_day_segments = day_segments.query("repeats_on == 'yday' and (repeats_value < 1 | repeats_value > 366)") - if invalid_day_segments.shape[0] > 0: - message = 'Every row with repeats_on=yday must have a repeats_value=[1,366] in the PERIODIC day segments file in [DAY_SEGMENTS][FILE].' \ - ' Modify row(s) of segment(s) {} of {}'.format(invalid_day_segments["label"].to_numpy(), day_segments_file) - raise ValueError(message) - - try: - day_segments["start_time"] = pd.to_datetime(day_segments["start_time"]) - except ValueError as err: - raise ValueError("At least one start_time in the PERIODIC day segments file in [DAY_SEGMENTS][FILE] has an invalid format, it should be HH:MM:SS in 24hr clock({}). Modify {}".format(err, day_segments_file)) - - if(day_segments.shape[0] != day_segments.drop_duplicates().shape[0]): - error_message = 'The PERIODIC day segments file in [DAY_SEGMENTS][FILE] has two or more rows that are identical. ' \ - 'Modify {}'.format(day_segments_file) - raise ValueError(error_message) - - duplicated_labels = day_segments[day_segments["label"].duplicated()] - if(duplicated_labels.shape[0] > 0): - error_message = 'Segements labels must be unique. The PERIODIC day segments file in [DAY_SEGMENTS][FILE] has {} row(s) with the same label {}. ' \ - 'Modify {}'.format(duplicated_labels.shape[0], duplicated_labels["label"].to_numpy(), day_segments_file) - raise ValueError(error_message) - - # TODO Validate string format for lubridate - - return True - -def is_valid_event_segments(day_segments, day_segments_file): - day_segments = day_segments.copy(deep=True) - - valid_columns = ["label", "event_timestamp", "length", "shift", "shift_direction", "device_id"] - if set(day_segments.columns) != set(valid_columns): - error_message = 'The EVENT day segments file in [DAY_SEGMENTS][FILE] must have six columns: label, event_timestamp, length, shift, shift_direction and device_id ' \ - 'but instead we found {}. Modify {}'.format(list(day_segments.columns), day_segments_file) - raise ValueError(error_message) - - if not pd.api.types.is_integer_dtype(day_segments.dtypes['event_timestamp']): - message = 'The column event_timestamp in the EVENT day segments file in [DAY_SEGMENTS][FILE] must be integer but instead is ' \ - '{}. This usually means that not all values in this column are formed by digits. 
Modify {}'.format(day_segments.dtypes['event_timestamp'], day_segments_file) - raise ValueError(message) - - valid_shift_direction_values = [1, -1, 0] - provided_values = day_segments["shift_direction"].unique() - if len(list(set(provided_values) - set(valid_shift_direction_values))) > 0: - error_message = 'The values of shift_direction column in the EVENT day segments file in [DAY_SEGMENTS][FILE] can only be 1, -1 or 0 ' \ - 'but instead we found {}. Modify {}'.format(provided_values, day_segments_file) - raise ValueError(error_message) - - if(day_segments.shape[0] != day_segments.drop_duplicates().shape[0]): - error_message = 'The EVENT day segments file in [DAY_SEGMENTS][FILE] has two or more rows that are identical. ' \ - 'Modify {}'.format(day_segments_file) - raise ValueError(error_message) - - # TODO Validate string format for lubridate of length and shift - # TODO validate unique labels per participant - return True - - -def parse_frequency_segments(day_segments: pd.DataFrame) -> pd.DataFrame: - """ - returns a table with rows identifying start and end of time slots with frequency freq (in minutes). For example, - for freq = 10 it outputs: - bin_id start end label - 0 00:00 00:10 epoch_0000 - 1 00:10 00:20 epoch_0001 - 2 00:20 00:30 epoch_0002 - ... - 143 23:50 00:00 epoch_0143 - day_segments argument is expected to have the following structure: - label length - epoch 10 - """ - freq = day_segments.iloc[0].loc['length'] - slots = pd.date_range(start='2020-01-01', end='2020-01-02', freq='{}min'.format(freq)) - slots = ['{:02d}:{:02d}'.format(x.hour, x.minute) for x in slots] - - table = pd.DataFrame(slots, columns=['start_time']) - table['length'] = day_segments.iloc[0].loc['length'] - table = table.iloc[:-1, :] - - label = day_segments.loc[0, 'label'] - table['label'] = range(0, table.shape[0]) - table['label'] = table['label'].apply(lambda x: '{}{:04}'.format(label, x)) - - return table[['start_time', 'length', 'label']] - -def parse_periodic_segments(day_segments): - day_segments.loc[day_segments["repeats_on"] == "every_day", "repeats_value"] = 0 - return day_segments - -def parse_event_segments(day_segments, device_ids): - return day_segments.query("device_id == @device_ids") - -def parse_day_segments(day_segments_file, segments_type, device_ids): - # Add code to validate and parse frequencies, intervals, and events - # Expected formats: - # Frequency: label, length columns (e.g. my_prefix, 5) length has to be in minutes (int) - # Interval: label, start, end columns (e.g. daily, 00:00, 23:59) start and end should be valid hours in 24 hour format - # Event: label, timestamp, length, shift (e.g., survey1, 1532313215463, 60, -30), timestamp is a UNIX timestamp in ms (we could take a date time string instead), length is in minutes (int), shift is in minutes (+/-int) and is added/substracted from timestamp - # Our output should have local_date, start_time, end_time, label. In the readable_datetime script, If local_date has the same value for all rows, every segment will be applied for all days, otherwise each segment will be applied only to its local_date - day_segments = pd.read_csv(day_segments_file) - - if day_segments is None: - message = 'The day segments file in [DAY_SEGMENTS][FILE] is None. Modify {}'.format(day_segments_file) - raise ValueError(message) - - if day_segments.shape[0] == 0: - message = 'The day segments file in [DAY_SEGMENTS][FILE] is empty. 
Modify {}'.format(day_segments_file) - raise ValueError(message) - - if(segments_type not in ["FREQUENCY", "PERIODIC", "EVENT"]): - raise ValueError("[DAY_SEGMENTS][TYPE] can only be FREQUENCY, PERIODIC, or EVENT") - - if(segments_type == "FREQUENCY" and is_valid_frequency_segments(day_segments, day_segments_file)): - day_segments = parse_frequency_segments(day_segments) - elif(segments_type == "PERIODIC" and is_valid_periodic_segments(day_segments, day_segments_file)): - day_segments = parse_periodic_segments(day_segments) - elif(segments_type == "EVENT" and is_valid_event_segments(day_segments, day_segments_file)): - day_segments = parse_event_segments(day_segments, device_ids) - else: - raise ValueError("{} does not have a format compatible with frequency, periodic or event day segments. Please refer to [LINK]".format(day_segments_file)) - return day_segments - -participant_file = yaml.load(open(snakemake.input[1], 'r'), Loader=yaml.FullLoader) -device_ids = [] -for key in participant_file.keys(): - if "DEVICE_IDS" in participant_file[key]: - device_ids = device_ids + participant_file[key]["DEVICE_IDS"] - -final_day_segments = parse_day_segments(snakemake.input[0], snakemake.params["day_segments_type"], device_ids) - -if snakemake.params["day_segments_type"] == "EVENT" and final_day_segments.shape[0] == 0: - warnings.warn("There are no event day segments for {}. Check your day segment file {}".format(snakemake.params["pid"], snakemake.input[0])) - -final_day_segments.to_csv(snakemake.output["segments_file"], index=False) -pd.DataFrame({"label" : final_day_segments["label"].unique()}).to_csv(snakemake.output["segments_labels_file"], index=False) \ No newline at end of file diff --git a/src/data/compute_time_segments.py b/src/data/compute_time_segments.py new file mode 100644 index 00000000..adf50958 --- /dev/null +++ b/src/data/compute_time_segments.py @@ -0,0 +1,216 @@ +import pandas as pd +import warnings +import yaml + +def is_valid_frequency_segments(time_segments, time_segments_file): + """ + returns true if time_segment has the expected structure for generating frequency segments; + raises ValueError exception otherwise. + """ + + valid_columns = ["label", "length"] + if set(time_segments.columns) != set(valid_columns): + error_message = 'The FREQUENCY time segments file in [TIME_SEGMENTS][FILE] must have two columns: label and length ' \ + 'but instead we found {}. Modify {}'.format(list(time_segments.columns), time_segments_file) + raise ValueError(error_message) + + if time_segments.shape[0] > 1: + message = 'The FREQUENCY time segments file in [TIME_SEGMENTS][FILE] can only have 1 row. ' \ + 'Modify {}'.format(time_segments_file) + raise ValueError(message) + + if not pd.api.types.is_integer_dtype(time_segments.dtypes['length']): + message = 'The column length in the FREQUENCY time segments file in [TIME_SEGMENTS][FILE] must be integer but instead is ' \ + '{}. This usually means that not all values in this column are formed by digits. Modify {}'.format(time_segments.dtypes['length'], time_segments_file) + raise ValueError(message) + + if time_segments.iloc[0].loc['length'] < 0: + message = 'The value in column length in the FREQUENCY time segments file in [TIME_SEGMENTS][FILE] must be positive but instead is ' \ + '{}.
Modify {}'.format(time_segments.iloc[0].loc['length'], time_segments_file) + raise ValueError(message) + if time_segments.iloc[0].loc['length'] >= 1440: + message = 'The column length in the FREQUENCY time segments file in [TIME_SEGMENTS][FILE] must be shorter than a day in minutes (1440) but instead is ' \ + '{}. Modify {}'.format(time_segments.iloc[0].loc['length'], time_segments_file) + raise ValueError(message) + + return True + +def is_valid_periodic_segments(time_segments, time_segments_file): + time_segments = time_segments.copy(deep=True) + + valid_columns = ["label", "start_time", "length", "repeats_on", "repeats_value"] + if set(time_segments.columns) != set(valid_columns): + error_message = 'The PERIODIC time segments file in [TIME_SEGMENTS][FILE] must have five columns: label, start_time, length, repeats_on, repeats_value ' \ + 'but instead we found {}. Modify {}'.format(list(time_segments.columns), time_segments_file) + raise ValueError(error_message) + + valid_repeats_on = ["every_day", "wday", "mday", "qday", "yday"] + if len(list(set(time_segments["repeats_on"]) - set(valid_repeats_on))) > 0: + error_message = 'The column repeats_on in the PERIODIC time segments file in [TIME_SEGMENTS][FILE] can only accept: "every_day", "wday", "mday", "qday", or "yday" ' \ + 'but instead we found {}. Modify {}'.format(list(set(time_segments["repeats_on"])), time_segments_file) + raise ValueError(error_message) + + if not pd.api.types.is_integer_dtype(time_segments.dtypes['repeats_value']): + message = 'The column repeats_value in the PERIODIC time segments file in [TIME_SEGMENTS][FILE] must be integer but instead is ' \ + '{}. This usually means that not all values in this column are formed by digits. Modify {}'.format(time_segments.dtypes['repeats_value'], time_segments_file) + raise ValueError(message) + + invalid_time_segments = time_segments.query("repeats_on == 'every_day' and repeats_value != 0") + if invalid_time_segments.shape[0] > 0: + message = 'Every row with repeats_on=every_day must have a repeats_value=0 in the PERIODIC time segments file in [TIME_SEGMENTS][FILE].' \ + ' Modify row(s) of segment(s) {} of {}'.format(invalid_time_segments["label"].to_numpy(), time_segments_file) + raise ValueError(message) + + invalid_time_segments = time_segments.query("repeats_on == 'wday' and (repeats_value < 1 | repeats_value > 7)") + if invalid_time_segments.shape[0] > 0: + message = 'Every row with repeats_on=wday must have a repeats_value=[1,7] in the PERIODIC time segments file in [TIME_SEGMENTS][FILE].' \ + ' Modify row(s) of segment(s) {} of {}'.format(invalid_time_segments["label"].to_numpy(), time_segments_file) + raise ValueError(message) + + invalid_time_segments = time_segments.query("repeats_on == 'mday' and (repeats_value < 1 | repeats_value > 31)") + if invalid_time_segments.shape[0] > 0: + message = 'Every row with repeats_on=mday must have a repeats_value=[1,31] in the PERIODIC time segments file in [TIME_SEGMENTS][FILE].' \ + ' Modify row(s) of segment(s) {} of {}'.format(invalid_time_segments["label"].to_numpy(), time_segments_file) + raise ValueError(message) + + invalid_time_segments = time_segments.query("repeats_on == 'qday' and (repeats_value < 1 | repeats_value > 92)") + if invalid_time_segments.shape[0] > 0: + message = 'Every row with repeats_on=qday must have a repeats_value=[1,92] in the PERIODIC time segments file in [TIME_SEGMENTS][FILE].'
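For reference, a hypothetical PERIODIC file that would pass these checks (lengths use lubridate-style duration strings as elsewhere in the pipeline; every_day rows must carry repeats_value 0):

    # Hypothetical PERIODIC segments file: label,start_time,length,repeats_on,repeats_value
    import pandas as pd
    from io import StringIO

    csv = ("label,start_time,length,repeats_on,repeats_value\n"
           "morning,06:00:00,3H,every_day,0\n"
           "weekly,00:00:00,6D 23H 59M 59S,wday,1\n")
    periodic = pd.read_csv(StringIO(csv))
    assert pd.api.types.is_integer_dtype(periodic.dtypes["repeats_value"])
    assert periodic.query("repeats_on == 'every_day' and repeats_value != 0").empty
    assert periodic.query("repeats_on == 'wday' and (repeats_value < 1 | repeats_value > 7)").empty
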
\ + ' Modify row(s) of segment(s) {} of {}'.format(invalid_time_segments["label"].to_numpy(), time_segments_file) + raise ValueError(message) + + invalid_time_segments = time_segments.query("repeats_on == 'yday' and (repeats_value < 1 | repeats_value > 366)") + if invalid_time_segments.shape[0] > 0: + message = 'Every row with repeats_on=yday must have a repeats_value=[1,366] in the PERIODIC time segments file in [TIME_SEGMENTS][FILE].' \ + ' Modify row(s) of segment(s) {} of {}'.format(invalid_time_segments["label"].to_numpy(), time_segments_file) + raise ValueError(message) + + try: + time_segments["start_time"] = pd.to_datetime(time_segments["start_time"]) + except ValueError as err: + raise ValueError("At least one start_time in the PERIODIC time segments file in [TIME_SEGMENTS][FILE] has an invalid format; it should be HH:MM:SS in 24hr clock({}). Modify {}".format(err, time_segments_file)) + + if(time_segments.shape[0] != time_segments.drop_duplicates().shape[0]): + error_message = 'The PERIODIC time segments file in [TIME_SEGMENTS][FILE] has two or more rows that are identical. ' \ + 'Modify {}'.format(time_segments_file) + raise ValueError(error_message) + + duplicated_labels = time_segments[time_segments["label"].duplicated()] + if(duplicated_labels.shape[0] > 0): + error_message = 'Segment labels must be unique. The PERIODIC time segments file in [TIME_SEGMENTS][FILE] has {} row(s) with the same label {}. ' \ + 'Modify {}'.format(duplicated_labels.shape[0], duplicated_labels["label"].to_numpy(), time_segments_file) + raise ValueError(error_message) + + # TODO Validate string format for lubridate + + return True + +def is_valid_event_segments(time_segments, time_segments_file): + time_segments = time_segments.copy(deep=True) + + valid_columns = ["label", "event_timestamp", "length", "shift", "shift_direction", "device_id"] + if set(time_segments.columns) != set(valid_columns): + error_message = 'The EVENT time segments file in [TIME_SEGMENTS][FILE] must have six columns: label, event_timestamp, length, shift, shift_direction and device_id ' \ + 'but instead we found {}. Modify {}'.format(list(time_segments.columns), time_segments_file) + raise ValueError(error_message) + + if not pd.api.types.is_integer_dtype(time_segments.dtypes['event_timestamp']): + message = 'The column event_timestamp in the EVENT time segments file in [TIME_SEGMENTS][FILE] must be integer but instead is ' \ + '{}. This usually means that not all values in this column are formed by digits. Modify {}'.format(time_segments.dtypes['event_timestamp'], time_segments_file) + raise ValueError(message) + + valid_shift_direction_values = [1, -1, 0] + provided_values = time_segments["shift_direction"].unique() + if len(list(set(provided_values) - set(valid_shift_direction_values))) > 0: + error_message = 'The values of shift_direction column in the EVENT time segments file in [TIME_SEGMENTS][FILE] can only be 1, -1 or 0 ' \ + 'but instead we found {}. Modify {}'.format(provided_values, time_segments_file) + raise ValueError(error_message) + + if(time_segments.shape[0] != time_segments.drop_duplicates().shape[0]): + error_message = 'The EVENT time segments file in [TIME_SEGMENTS][FILE] has two or more rows that are identical.
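And a hypothetical EVENT row together with the shift arithmetic that assign_to_time_segment.R applies to it (timestamps in UNIX milliseconds; all values invented for illustration):

    # Hypothetical EVENT segments row: label,event_timestamp,length,shift,shift_direction,device_id
    # e.g. survey1,1587661220000,30M,1H,-1,<device uuid>
    # Mirrors the segment_start_ts/segment_end_ts computation in assign_to_time_segment.R.
    event_timestamp = 1587661220000                              # UNIX ms
    length_s, shift_s, shift_direction = 30 * 60, 60 * 60, -1   # "30M" and "1H" in seconds

    segment_start_ts = event_timestamp + shift_s * (1 if shift_direction >= 0 else -1) * 1000
    segment_end_ts = segment_start_ts + length_s * 1000
    print(segment_start_ts, segment_end_ts)                      # 1587657620000 1587659420000
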
' \ + 'Modify {}'.format(time_segments_file) + raise ValueError(error_message) + + # TODO Validate string format for lubridate of length and shift + # TODO validate unique labels per participant + return True + + +def parse_frequency_segments(time_segments: pd.DataFrame) -> pd.DataFrame: + """ + returns a table with rows identifying start and end of time slots with frequency freq (in minutes). For example, + for freq = 10 it outputs: + bin_id start end label + 0 00:00 00:10 epoch_0000 + 1 00:10 00:20 epoch_0001 + 2 00:20 00:30 epoch_0002 + ... + 143 23:50 00:00 epoch_0143 + time_segments argument is expected to have the following structure: + label length + epoch 10 + """ + freq = time_segments.iloc[0].loc['length'] + slots = pd.date_range(start='2020-01-01', end='2020-01-02', freq='{}min'.format(freq)) + slots = ['{:02d}:{:02d}'.format(x.hour, x.minute) for x in slots] + + table = pd.DataFrame(slots, columns=['start_time']) + table['length'] = time_segments.iloc[0].loc['length'] + table = table.iloc[:-1, :] + + label = time_segments.loc[0, 'label'] + table['label'] = range(0, table.shape[0]) + table['label'] = table['label'].apply(lambda x: '{}{:04}'.format(label, x)) + + return table[['start_time', 'length', 'label']] + +def parse_periodic_segments(time_segments): + time_segments.loc[time_segments["repeats_on"] == "every_day", "repeats_value"] = 0 + return time_segments + +def parse_event_segments(time_segments, device_ids): + return time_segments.query("device_id == @device_ids") + +def parse_time_segments(time_segments_file, segments_type, device_ids): + # Add code to validate and parse frequencies, intervals, and events + # Expected formats: + # Frequency: label, length columns (e.g. my_prefix, 5) length has to be in minutes (int) + # Interval: label, start, end columns (e.g. daily, 00:00, 23:59) start and end should be valid hours in 24 hour format + # Event: label, timestamp, length, shift (e.g., survey1, 1532313215463, 60, -30), timestamp is a UNIX timestamp in ms (we could take a date time string instead), length is in minutes (int), shift is in minutes (+/-int) and is added/subtracted from timestamp + # Our output should have local_date, start_time, end_time, label. In the readable_datetime script, if local_date has the same value for all rows, every segment will be applied for all days, otherwise each segment will be applied only to its local_date + time_segments = pd.read_csv(time_segments_file) + + if time_segments is None: + message = 'The time segments file in [TIME_SEGMENTS][FILE] is None. Modify {}'.format(time_segments_file) + raise ValueError(message) + + if time_segments.shape[0] == 0: + message = 'The time segments file in [TIME_SEGMENTS][FILE] is empty. Modify {}'.format(time_segments_file) + raise ValueError(message) + + if(segments_type not in ["FREQUENCY", "PERIODIC", "EVENT"]): + raise ValueError("[TIME_SEGMENTS][TYPE] can only be FREQUENCY, PERIODIC, or EVENT") + + if(segments_type == "FREQUENCY" and is_valid_frequency_segments(time_segments, time_segments_file)): + time_segments = parse_frequency_segments(time_segments) + elif(segments_type == "PERIODIC" and is_valid_periodic_segments(time_segments, time_segments_file)): + time_segments = parse_periodic_segments(time_segments) + elif(segments_type == "EVENT" and is_valid_event_segments(time_segments, time_segments_file)): + time_segments = parse_event_segments(time_segments, device_ids) + else: + raise ValueError("{} does not have a format compatible with frequency, periodic or event time segments.
Please refer to [LINK]".format(time_segments_file)) + return time_segments + +participant_file = yaml.load(open(snakemake.input[1], 'r'), Loader=yaml.FullLoader) +device_ids = [] +for key in participant_file.keys(): + if "DEVICE_IDS" in participant_file[key]: + device_ids = device_ids + participant_file[key]["DEVICE_IDS"] + +final_time_segments = parse_time_segments(snakemake.input[0], snakemake.params["time_segments_type"], device_ids) + +if snakemake.params["time_segments_type"] == "EVENT" and final_time_segments.shape[0] == 0: + warnings.warn("There are no event time segments for {}. Check your time segment file {}".format(snakemake.params["pid"], snakemake.input[0])) + +final_time_segments.to_csv(snakemake.output["segments_file"], index=False) +pd.DataFrame({"label" : final_time_segments["label"].unique()}).to_csv(snakemake.output["segments_labels_file"], index=False) \ No newline at end of file diff --git a/src/data/phone_sensed_bins.R b/src/data/phone_sensed_bins.R deleted file mode 100644 index cc6dc791..00000000 --- a/src/data/phone_sensed_bins.R +++ /dev/null @@ -1,40 +0,0 @@ -source("renv/activate.R") - -library("dplyr", warn.conflicts = F) -library(tidyr) -library(lubridate) - -all_sensors <- snakemake@input[["all_sensors"]] -bin_size <- snakemake@params[["bin_size"]] -output_file <- snakemake@output[[1]] - -# Load all sensors and extract timestamps -all_sensor_data <- data.frame(timestamp = c()) -for(sensor in all_sensors){ - sensor_data <- read.csv(sensor, stringsAsFactors = F) %>% - select(local_date, local_hour, local_minute) %>% - mutate(sensor = basename(sensor)) - all_sensor_data <- rbind(all_sensor_data, sensor_data) -} - -if(nrow(all_sensor_data) == 0){ - bins = seq(0, 59, by = bin_size) - hours = seq(0, 23, 1) - write.csv(crossing(hours, bins) %>% unite("hour_bin",hours, bins, sep = "_") %>% mutate(value = NA, local_date = NA) %>% pivot_wider(names_from = hour_bin, values_from=value) %>% head(0), output_file, row.names = FALSE) -} else{ - phone_sensed_bins <- all_sensor_data %>% - mutate(bin = (local_minute %/% bin_size) * bin_size) %>% # bin rows into bin_size-minute bins - group_by(local_date, local_hour, bin) %>% - summarise(sensor_count = n_distinct(sensor)) %>% - ungroup() %>% - mutate(local_date = lubridate::ymd(local_date)) %>% - complete(local_date = seq.Date(min(local_date), max(local_date), by="day"), - fill = list(local_hour = 0, bin = 0, sensor_count = 0)) %>% - complete(nesting(local_date), - local_hour = seq(0, 23, 1), - bin = seq(0, 59, bin_size), - fill = list(sensor_count=0)) %>% - pivot_wider(names_from = c(local_hour, bin), values_from = sensor_count) - - write.csv(phone_sensed_bins, output_file, row.names = FALSE) -} diff --git a/src/data/phone_valid_sensed_days.R b/src/data/phone_valid_sensed_days.R deleted file mode 100644 index 6390f82f..00000000 --- a/src/data/phone_valid_sensed_days.R +++ /dev/null @@ -1,18 +0,0 @@ -source("renv/activate.R") -library("dplyr", warn.conflicts = F) -library("tidyr") - -phone_sensed_bins <- read.csv(snakemake@input[["phone_sensed_bins"]]) -min_valid_hours_per_day <- as.integer(snakemake@params[["min_valid_hours_per_day"]]) -min_valid_bins_per_hour <- as.integer(snakemake@params[["min_valid_bins_per_hour"]]) -output_file <- snakemake@output[[1]] - -phone_valid_sensed_days <- phone_sensed_bins %>% - pivot_longer(cols = -local_date, names_to = c("hour", "bin"), names_sep = "_") %>% - group_by(local_date, hour) %>% - summarise(valid_bins = sum(value > 0)) %>% - group_by(local_date) %>% - summarise(valid_sensed_hours = 
sum(valid_bins >= min_valid_bins_per_hour)) %>% - mutate(is_valid_sensed_day = ifelse(valid_sensed_hours >= min_valid_hours_per_day, TRUE, FALSE)) - -write.csv(phone_valid_sensed_days, output_file, row.names = FALSE) diff --git a/src/data/readable_datetime.R b/src/data/readable_datetime.R index e4f3b54a..10a78209 100644 --- a/src/data/readable_datetime.R +++ b/src/data/readable_datetime.R @@ -2,17 +2,17 @@ source("renv/activate.R") library("tidyverse") library("readr") -source("src/data/assign_to_day_segment.R") +source("src/data/assign_to_time_segment.R") input <- read.csv(snakemake@input[["sensor_input"]]) %>% arrange(timestamp) -day_segments <- read.csv(snakemake@input[["day_segments"]]) -day_segments_type <- snakemake@params[["day_segments_type"]] +time_segments <- read.csv(snakemake@input[["time_segments"]]) +time_segments_type <- snakemake@params[["time_segments_type"]] sensor_output <- snakemake@output[[1]] timezone_periods <- snakemake@params[["timezone_periods"]] fixed_timezone <- snakemake@params[["fixed_timezone"]] include_past_periodic_segments <- snakemake@params[["include_past_periodic_segments"]] -split_local_date_time <- function(data, day_segments){ +split_local_date_time <- function(data, time_segments){ split_data <- data %>% separate(local_date_time, c("local_date","local_time"), "\\s", remove = FALSE) %>% separate(local_time, c("local_hour", "local_minute"), ":", remove = FALSE, extra = "drop") %>% @@ -34,16 +34,16 @@ if(!is.null(timezone_periods)){ # rowwise() %>% # mutate(utc_date_time = as.POSIXct(timestamp/1000, origin="1970-01-01", tz="UTC"), # local_date_time = format(utc_date_time, tz = timezone, usetz = T, "%Y-%m-%d %H:%M:%S")) - # output <- split_local_date_time(output, day_segments) - # TODO: Implement day segment assigment with support for multiple timezones - # output <- assign_to_day_segment(output, day_segments, day_segments_type, fixed_timezone) + # output <- split_local_date_time(output, time_segments) + # TODO: Implement time segment assignment with support for multiple timezones + # output <- assign_to_time_segment(output, time_segments, time_segments_type, fixed_timezone) # write.csv(output, sensor_output) } else if(!is.null(fixed_timezone)){ output <- input %>% mutate(utc_date_time = as.POSIXct(timestamp/1000, origin="1970-01-01", tz="UTC"), local_timezone = fixed_timezone, local_date_time = format(utc_date_time, tz = fixed_timezone, "%Y-%m-%d %H:%M:%S")) - output <- split_local_date_time(output, day_segments) - output <- assign_to_day_segment(output, day_segments, day_segments_type, include_past_periodic_segments) + output <- split_local_date_time(output, time_segments) + output <- assign_to_time_segment(output, time_segments, time_segments_type, include_past_periodic_segments) write_csv(output, sensor_output) } diff --git a/src/features/entry.R b/src/features/entry.R index c4220924..51131c0f 100644 --- a/src/features/entry.R +++ b/src/features/entry.R @@ -4,13 +4,13 @@ library("dplyr",warn.conflicts = F) library("tidyr") sensor_data_files <- snakemake@input -sensor_data_files$day_segments_labels <- NULL -day_segments_file <- snakemake@input[["day_segments_labels"]] +sensor_data_files$time_segments_labels <- NULL +time_segments_file <- snakemake@input[["time_segments_labels"]] provider <- snakemake@params["provider"][["provider"]] provider_key <- snakemake@params["provider_key"] sensor_key <- snakemake@params["sensor_key"] -sensor_features <- fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, day_segments_file)
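After readable_datetime.R runs, each sensor row carries an assigned_segments string built by the stri_c() calls above, e.g. "[daily#2020-03-09 00:00:00,2020-03-09 23:59:59;1583730000000,1583816399000]", with multiple matching segments joined by "|". A rough sketch of how a feature script can split that column into one row per segment (format inferred from the R code; this is illustrative, not the actual filter_data_by_segment implementation in utils):

    import pandas as pd

    # One sensor row assigned to two overlapping time segments (values invented).
    data = pd.DataFrame({
        "timestamp": [1583754000000],
        "assigned_segments": ["[daily#2020-03-09 00:00:00,2020-03-09 23:59:59;1583730000000,1583816399000]"
                              "|[morning#2020-03-09 06:00:00,2020-03-09 08:59:59;1583751600000,1583762399000]"],
    })
    data = data.assign(local_segment=data["assigned_segments"].str.split("|")).explode("local_segment")
    morning_rows = data[data["local_segment"].str.startswith("[morning#")]
    print(len(morning_rows))   # 1
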
+sensor_features <- fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file) write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE) \ No newline at end of file diff --git a/src/features/entry.py b/src/features/entry.py index c76c0c4f..e05ee9d6 100644 --- a/src/features/entry.py +++ b/src/features/entry.py @@ -2,13 +2,13 @@ import pandas as pd from utils.utils import fetch_provider_features sensor_data_files = dict(snakemake.input) -del sensor_data_files["day_segments_labels"] -day_segments_file = snakemake.input["day_segments_labels"] +del sensor_data_files["time_segments_labels"] +time_segments_file = snakemake.input["time_segments_labels"] provider = snakemake.params["provider"] provider_key = snakemake.params["provider_key"] sensor_key = snakemake.params["sensor_key"] -sensor_features = fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, day_segments_file) +sensor_features = fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file) sensor_features.to_csv(snakemake.output[0], index=False) \ No newline at end of file diff --git a/src/features/fitbit_heartrate_intraday/rapids/main.py b/src/features/fitbit_heartrate_intraday/rapids/main.py index 5224e0df..1a853722 100644 --- a/src/features/fitbit_heartrate_intraday/rapids/main.py +++ b/src/features/fitbit_heartrate_intraday/rapids/main.py @@ -41,11 +41,11 @@ def statsFeatures(heartrate_data, features, features_type, heartrate_features): return heartrate_features -def extractHRFeaturesFromIntradayData(heartrate_intraday_data, features, day_segment, filter_data_by_segment): +def extractHRFeaturesFromIntradayData(heartrate_intraday_data, features, time_segment, filter_data_by_segment): heartrate_intraday_features = pd.DataFrame(columns=["local_segment"] + features) if not heartrate_intraday_data.empty: num_rows_per_minute = heartrate_intraday_data.groupby(["local_date", "local_hour", "local_minute"]).count().mean()["device_id"] - heartrate_intraday_data = filter_data_by_segment(heartrate_intraday_data, day_segment) + heartrate_intraday_data = filter_data_by_segment(heartrate_intraday_data, time_segment) if not heartrate_intraday_data.empty: heartrate_intraday_features = pd.DataFrame() @@ -63,7 +63,7 @@ def extractHRFeaturesFromIntradayData(heartrate_intraday_data, features, day_seg return heartrate_intraday_features -def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs): +def rapids_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs): heartrate_intraday_data = pd.read_csv(sensor_data_files["sensor_data"]) @@ -74,6 +74,6 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg intraday_features_to_compute = list(set(requested_intraday_features) & set(base_intraday_features_names)) # extract features from intraday data - heartrate_intraday_features = extractHRFeaturesFromIntradayData(heartrate_intraday_data, intraday_features_to_compute, day_segment, filter_data_by_segment) + heartrate_intraday_features = extractHRFeaturesFromIntradayData(heartrate_intraday_data, intraday_features_to_compute, time_segment, filter_data_by_segment) return heartrate_intraday_features diff --git a/src/features/fitbit_heartrate_summary/rapids/main.py b/src/features/fitbit_heartrate_summary/rapids/main.py index b07f6ec3..e236c38f 100644 --- a/src/features/fitbit_heartrate_summary/rapids/main.py +++ 
b/src/features/fitbit_heartrate_summary/rapids/main.py @@ -59,7 +59,7 @@ def extractHRFeaturesFromSummaryData(heartrate_summary_data, summary_features): return heartrate_summary_features -def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs): +def rapids_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs): heartrate_summary_data = pd.read_csv(sensor_data_files["sensor_data"]) @@ -72,14 +72,14 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg # extract features from summary data heartrate_summary_features = pd.DataFrame(columns=["local_segment"] + summary_features_to_compute) if not heartrate_summary_data.empty: - heartrate_summary_data = filter_data_by_segment(heartrate_summary_data, day_segment) + heartrate_summary_data = filter_data_by_segment(heartrate_summary_data, time_segment) if not heartrate_summary_data.empty: # only keep the segments start at 00:00:00 and end at 23:59:59 datetime_start_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 00:00:00" datetime_end_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 23:59:59" - segment_regex = "{}#{},{}".format(day_segment, datetime_start_regex, datetime_end_regex) + segment_regex = "{}#{},{}".format(time_segment, datetime_start_regex, datetime_end_regex) heartrate_summary_data = heartrate_summary_data[heartrate_summary_data["local_segment"].str.match(segment_regex)] if not heartrate_summary_data.empty: diff --git a/src/features/fitbit_sleep_summary/rapids/main.py b/src/features/fitbit_sleep_summary/rapids/main.py index 46ce2052..048baa20 100644 --- a/src/features/fitbit_sleep_summary/rapids/main.py +++ b/src/features/fitbit_sleep_summary/rapids/main.py @@ -47,7 +47,7 @@ def extractSleepFeaturesFromSummaryData(sleep_summary_data, summary_features, sl return sleep_summary_features -def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs): +def rapids_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs): sleep_summary_data = pd.read_csv(sensor_data_files["sensor_data"]) @@ -68,14 +68,14 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg # extract features from summary data sleep_summary_features = pd.DataFrame(columns=["local_segment"] + features_fullnames_to_compute) if not sleep_summary_data.empty: - sleep_summary_data = filter_data_by_segment(sleep_summary_data, day_segment) + sleep_summary_data = filter_data_by_segment(sleep_summary_data, time_segment) if not sleep_summary_data.empty: # only keep the segments start at 00:00:00 and end at 23:59:59 datetime_start_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 00:00:00" datetime_end_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 23:59:59" - segment_regex = "{}#{},{}".format(day_segment, datetime_start_regex, datetime_end_regex) + segment_regex = "{}#{},{}".format(time_segment, datetime_start_regex, datetime_end_regex) sleep_summary_data = sleep_summary_data[sleep_summary_data["local_segment"].str.match(segment_regex)] if not sleep_summary_data.empty: diff --git a/src/features/fitbit_steps_intraday/rapids/main.py b/src/features/fitbit_steps_intraday/rapids/main.py index 8e508d4e..ee9d0dd3 100644 --- a/src/features/fitbit_steps_intraday/rapids/main.py +++ b/src/features/fitbit_steps_intraday/rapids/main.py @@ -64,7 +64,7 @@ def extractStepsFeaturesFromIntradayData(steps_intraday_data, threshold_active_b -def 
rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs): +def rapids_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs): threshold_active_bout = provider["THRESHOLD_ACTIVE_BOUT"] include_zero_step_rows = provider["INCLUDE_ZERO_STEP_ROWS"] @@ -90,7 +90,7 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg # extract features from intraday features steps_intraday_features = pd.DataFrame(columns=["local_segment"] + intraday_features_to_compute) if not steps_intraday_data.empty: - steps_intraday_data = filter_data_by_segment(steps_intraday_data, day_segment) + steps_intraday_data = filter_data_by_segment(steps_intraday_data, time_segment) if not steps_intraday_data.empty: steps_intraday_features = extractStepsFeaturesFromIntradayData(steps_intraday_data, threshold_active_bout, intraday_features_to_compute_steps, intraday_features_to_compute_sedentarybout, intraday_features_to_compute_activebout, steps_intraday_features) diff --git a/src/features/fitbit_steps_summary/rapids/main.py b/src/features/fitbit_steps_summary/rapids/main.py index e49a895f..5db8bc52 100644 --- a/src/features/fitbit_steps_summary/rapids/main.py +++ b/src/features/fitbit_steps_summary/rapids/main.py @@ -38,7 +38,7 @@ def extractStepsFeaturesFromSummaryData(steps_summary_data, summary_features_to_ -def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs): +def rapids_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs): steps_summary_data = pd.read_csv(sensor_data_files["sensor_data"]) requested_summary_features = provider["FEATURES"] @@ -51,14 +51,14 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg # extract features from summary data steps_summary_features = pd.DataFrame(columns=["local_segment"] + summary_features_to_compute) if not steps_summary_data.empty: - steps_summary_data = filter_data_by_segment(steps_summary_data, day_segment) + steps_summary_data = filter_data_by_segment(steps_summary_data, time_segment) if not steps_summary_data.empty: # only keep the segments start at 00:00:00 and end at 23:59:59 datetime_start_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 00:00:00" datetime_end_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 23:59:59" - segment_regex = "{}#{},{}".format(day_segment, datetime_start_regex, datetime_end_regex) + segment_regex = "{}#{},{}".format(time_segment, datetime_start_regex, datetime_end_regex) steps_summary_data = steps_summary_data[steps_summary_data["local_segment"].str.match(segment_regex)] if not steps_summary_data.empty: diff --git a/src/features/phone_accelerometer/panda/main.py b/src/features/phone_accelerometer/panda/main.py index 1b1139df..be52a8b6 100644 --- a/src/features/phone_accelerometer/panda/main.py +++ b/src/features/phone_accelerometer/panda/main.py @@ -42,7 +42,7 @@ def statsFeatures(acc_data, features_to_compute, features_type, acc_features): -def panda_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs): +def panda_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs): acc_data = pd.read_csv(sensor_data_files["sensor_data"]) requested_features = provider["FEATURES"] @@ -58,7 +58,7 @@ def panda_features(sensor_data_files, day_segment, provider, filter_data_by_segm acc_features = pd.DataFrame(columns=["local_segment"] + features_to_compute) if not 
acc_data.empty: - acc_data = filter_data_by_segment(acc_data, day_segment) + acc_data = filter_data_by_segment(acc_data, time_segment) if not acc_data.empty: acc_features = pd.DataFrame() diff --git a/src/features/phone_accelerometer/rapids/main.py b/src/features/phone_accelerometer/rapids/main.py index dc197066..7fc10918 100644 --- a/src/features/phone_accelerometer/rapids/main.py +++ b/src/features/phone_accelerometer/rapids/main.py @@ -1,7 +1,7 @@ import pandas as pd import numpy as np -def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs): +def rapids_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs): acc_data = pd.read_csv(sensor_data_files["sensor_data"]) requested_features = provider["FEATURES"] @@ -12,7 +12,7 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg acc_features = pd.DataFrame(columns=["local_segment"] + features_to_compute) if not acc_data.empty: - acc_data = filter_data_by_segment(acc_data, day_segment) + acc_data = filter_data_by_segment(acc_data, time_segment) if not acc_data.empty: acc_features = pd.DataFrame() diff --git a/src/features/phone_activity_recognition/rapids/main.py b/src/features/phone_activity_recognition/rapids/main.py index 099562fd..d504e6c5 100644 --- a/src/features/phone_activity_recognition/rapids/main.py +++ b/src/features/phone_activity_recognition/rapids/main.py @@ -1,7 +1,7 @@ import pandas as pd import numpy as np -def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs): +def rapids_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs): ar_episodes = pd.read_csv(sensor_data_files["sensor_episodes"]) activity_classes = provider["ACTIVITY_CLASSES"] @@ -14,7 +14,7 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg ar_features = pd.DataFrame(columns=["local_segment"] + features_to_compute) if not ar_episodes.empty: - ar_episodes = filter_data_by_segment(ar_episodes, day_segment) + ar_episodes = filter_data_by_segment(ar_episodes, time_segment) if not ar_episodes.empty: ar_features = pd.DataFrame() diff --git a/src/features/phone_applications_foreground/rapids/main.py b/src/features/phone_applications_foreground/rapids/main.py index 305e07fc..970e5c55 100644 --- a/src/features/phone_applications_foreground/rapids/main.py +++ b/src/features/phone_applications_foreground/rapids/main.py @@ -4,7 +4,7 @@ import itertools from scipy.stats import entropy -def compute_features(filtered_data, apps_type, requested_features, apps_features, day_segment): +def compute_features(filtered_data, apps_type, requested_features, apps_features, time_segment): # There is the rare occasion that filtered_data is empty (found in testing) if "timeoffirstuse" in requested_features: time_first_event = filtered_data.sort_values(by="timestamp", ascending=True).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment") @@ -30,7 +30,7 @@ def compute_features(filtered_data, apps_type, requested_features, apps_features return apps_features -def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs): +def rapids_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs): apps_data = pd.read_csv(sensor_data_files["sensor_data"]) @@ -58,7 +58,7 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg # deep copy the 
apps_data for the top1global computation apps_data_global = apps_data.copy() - apps_data = filter_data_by_segment(apps_data, day_segment) + apps_data = filter_data_by_segment(apps_data, time_segment) if not apps_data.empty: apps_features = pd.DataFrame() @@ -66,14 +66,14 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg single_categories.sort() for sc in single_categories: if sc == "all": - apps_features = compute_features(apps_data, "all", requested_features, apps_features, day_segment) + apps_features = compute_features(apps_data, "all", requested_features, apps_features, time_segment) else: filtered_data = apps_data[apps_data["genre"].isin([sc])] - apps_features = compute_features(filtered_data, sc, requested_features, apps_features, day_segment) + apps_features = compute_features(filtered_data, sc, requested_features, apps_features, time_segment) # multiple category for mc in multiple_categories: filtered_data = apps_data[apps_data["genre"].isin(multiple_categories_with_genres[mc])] - apps_features = compute_features(filtered_data, mc, requested_features, apps_features, day_segment) + apps_features = compute_features(filtered_data, mc, requested_features, apps_features, time_segment) # single apps for app in single_apps: col_name = app @@ -83,7 +83,7 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg app = apps_with_count.iloc[0]["package_name"] col_name = "top1global" filtered_data = apps_data[apps_data["package_name"].isin([app])] - apps_features = compute_features(filtered_data, col_name, requested_features, apps_features, day_segment) + apps_features = compute_features(filtered_data, col_name, requested_features, apps_features, time_segment) apps_features = apps_features.reset_index() diff --git a/src/features/phone_battery/rapids/main.py b/src/features/phone_battery/rapids/main.py index 20702571..52b5199e 100644 --- a/src/features/phone_battery/rapids/main.py +++ b/src/features/phone_battery/rapids/main.py @@ -1,7 +1,7 @@ import pandas as pd from datetime import datetime, timedelta, time -def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs): +def rapids_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs): battery_data = pd.read_csv(sensor_data_files["sensor_episodes"]) @@ -13,7 +13,7 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg battery_features = pd.DataFrame(columns=["local_segment"] + features_to_compute) if not battery_data.empty: - battery_data = filter_data_by_segment(battery_data, day_segment) + battery_data = filter_data_by_segment(battery_data, time_segment) if not battery_data.empty: diff --git a/src/features/phone_bluetooth/rapids/main.R b/src/features/phone_bluetooth/rapids/main.R index 599db220..a8aa1ace 100644 --- a/src/features/phone_bluetooth/rapids/main.R +++ b/src/features/phone_bluetooth/rapids/main.R @@ -1,8 +1,8 @@ library("dplyr", warn.conflicts = F) library(tidyr) -compute_bluetooth_feature <- function(data, feature, day_segment){ - data <- data %>% filter_data_by_segment(day_segment) +compute_bluetooth_feature <- function(data, feature, time_segment){ + data <- data %>% filter_data_by_segment(time_segment) if(feature %in% c("countscans", "uniquedevices")){ data <- data %>% group_by(local_segment) data <- switch(feature, @@ -27,7 +27,7 @@ compute_bluetooth_feature <- function(data, feature, day_segment){ } } -rapids_features <- function(sensor_data_files, 
day_segment, provider){ +rapids_features <- function(sensor_data_files, time_segment, provider){ bluetooth_data <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE) requested_features <- provider[["FEATURES"]] @@ -42,7 +42,7 @@ rapids_features <- function(sensor_data_files, day_segment, provider){ features_to_compute <- intersect(base_features_names, requested_features) for(feature_name in features_to_compute){ - feature <- compute_bluetooth_feature(bluetooth_data, feature_name, day_segment) + feature <- compute_bluetooth_feature(bluetooth_data, feature_name, time_segment) features <- merge(features, feature, by="local_segment", all = TRUE) } diff --git a/src/features/phone_calls/rapids/main.R b/src/features/phone_calls/rapids/main.R index 1bd1c897..39cdfc45 100644 --- a/src/features/phone_calls/rapids/main.R +++ b/src/features/phone_calls/rapids/main.R @@ -7,7 +7,7 @@ Mode <- function(v) { uniqv[which.max(tabulate(match(v, uniqv)))] } -call_features_of_type <- function(calls, call_type, day_segment, requested_features){ +call_features_of_type <- function(calls, call_type, time_segment, requested_features){ # Output dataframe features = data.frame(local_segment = character(), stringsAsFactors = FALSE) @@ -59,14 +59,14 @@ call_features_of_type <- function(calls, call_type, day_segment, requested_featu return(features) } -rapids_features <- function(sensor_data_files, day_segment, provider){ +rapids_features <- function(sensor_data_files, time_segment, provider){ calls_data <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE) - calls_data <- calls_data %>% filter_data_by_segment(day_segment) + calls_data <- calls_data %>% filter_data_by_segment(time_segment) call_types = provider[["CALL_TYPES"]] call_features <- setNames(data.frame(matrix(ncol=1, nrow=0)), c("local_segment")) for(call_type in call_types){ - # Filter rows that belong to the calls type and day segment of interest + # Filter rows that belong to the calls type and time segment of interest call_type_label = ifelse(call_type == "incoming", "1", ifelse(call_type == "outgoing", "2", ifelse(call_type == "missed", "3", NA))) if(is.na(call_type_label)) stop(paste("Call type can online be incoming, outgoing or missed but instead you typed: ", call_type, " in config[CALLS][CALL_TYPES]")) @@ -74,7 +74,7 @@ rapids_features <- function(sensor_data_files, day_segment, provider){ requested_features <- provider[["FEATURES"]][[call_type]] calls_of_type <- calls_data %>% filter(call_type == call_type_label) - features <- call_features_of_type(calls_of_type, call_type, day_segment, requested_features) + features <- call_features_of_type(calls_of_type, call_type, time_segment, requested_features) call_features <- merge(call_features, features, all=TRUE) } call_features <- call_features %>% mutate_at(vars(contains("countmostfrequentcontact") | contains("distinctcontacts") | contains("count")), list( ~ replace_na(., 0))) diff --git a/src/features/phone_conversation/rapids/main.py b/src/features/phone_conversation/rapids/main.py index f04e9c68..902cad69 100644 --- a/src/features/phone_conversation/rapids/main.py +++ b/src/features/phone_conversation/rapids/main.py @@ -1,7 +1,7 @@ import pandas as pd import numpy as np -def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs): +def rapids_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs): conversation_data = pd.read_csv(sensor_data_files["sensor_data"]) @@ -23,7 +23,7 @@ 
def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg conversation_features = pd.DataFrame(columns=["local_segment"] + features_to_compute) if not conversation_data.empty: - conversation_data = filter_data_by_segment(conversation_data, day_segment) + conversation_data = filter_data_by_segment(conversation_data, time_segment) if not conversation_data.empty: conversation_features = pd.DataFrame() diff --git a/src/features/phone_data_yield/rapids/main.R b/src/features/phone_data_yield/rapids/main.R index b51fd8fa..57473e98 100644 --- a/src/features/phone_data_yield/rapids/main.R +++ b/src/features/phone_data_yield/rapids/main.R @@ -2,8 +2,8 @@ library("dplyr", warn.conflicts = F) library(tidyr) library(readr) -compute_data_yield_features <- function(data, feature_name, day_segment, provider){ - data <- data %>% filter_data_by_segment(day_segment) +compute_data_yield_features <- function(data, feature_name, time_segment, provider){ + data <- data %>% filter_data_by_segment(time_segment) features <- data %>% separate(timestamps_segment, into = c("start_timestamp", "end_timestamp"), convert = T, sep = ",") %>% mutate(duration_minutes = (end_timestamp - start_timestamp) / 60000, @@ -26,7 +26,7 @@ compute_data_yield_features <- function(data, feature_name, day_segment, provide -rapids_features <- function(sensor_data_files, day_segment, provider){ +rapids_features <- function(sensor_data_files, time_segment, provider){ yield_data <- read_csv(sensor_data_files[["sensor_data"]], col_types = cols_only(timestamp ="d", assigned_segments = "c")) requested_features <- provider[["FEATURES"]] @@ -40,7 +40,7 @@ rapids_features <- function(sensor_data_files, day_segment, provider){ # The subset of requested features this function can compute features_to_compute <- intersect(base_features_names, requested_features) - features <- compute_data_yield_features(yield_data, feature_name, day_segment, provider) %>% + features <- compute_data_yield_features(yield_data, feature_name, time_segment, provider) %>% select(c("local_segment", features_to_compute)) return(features) diff --git a/src/features/phone_light/rapids/main.py b/src/features/phone_light/rapids/main.py index 9231a2d9..32df47ef 100644 --- a/src/features/phone_light/rapids/main.py +++ b/src/features/phone_light/rapids/main.py @@ -1,7 +1,7 @@ import pandas as pd import numpy as np -def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs): +def rapids_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs): light_data = pd.read_csv(sensor_data_files["sensor_data"]) requested_features = provider["FEATURES"] @@ -12,7 +12,7 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg light_features = pd.DataFrame(columns=["local_segment"] + features_to_compute) if not light_data.empty: - light_data = filter_data_by_segment(light_data, day_segment) + light_data = filter_data_by_segment(light_data, time_segment) if not light_data.empty: light_features = pd.DataFrame() diff --git a/src/features/phone_locations/barnett/main.R b/src/features/phone_locations/barnett/main.R index b4454464..b305be20 100644 --- a/src/features/phone_locations/barnett/main.R +++ b/src/features/phone_locations/barnett/main.R @@ -27,7 +27,7 @@ create_empty_file <- function(requested_features){ ) %>% select(all_of(requested_features))) } -barnett_features <- function(sensor_data_files, day_segment, params){ +barnett_features <- function(sensor_data_files, 
time_segment, params){ location_data <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE) location_features <- NULL @@ -51,14 +51,14 @@ barnett_features <- function(sensor_data_files, day_segment, params){ if (nrow(location) > 1){ # Filter by segment and skipping any non-daily segment - location <- location %>% filter_data_by_segment(day_segment) + location <- location %>% filter_data_by_segment(time_segment) datetime_start_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 00:00:00" datetime_end_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 23:59:59" - location <- location %>% mutate(is_daily = str_detect(local_segment, paste0(day_segment, "#", datetime_start_regex, ",", datetime_end_regex))) + location <- location %>% mutate(is_daily = str_detect(local_segment, paste0(time_segment, "#", datetime_start_regex, ",", datetime_end_regex))) if(!all(location$is_daily)){ - message(paste("Barnett's location features cannot be computed for day segmentes that are not daily (cover 00:00:00 to 23:59:59 of every day). Skipping ", day_segment)) + message(paste("Barnett's location features cannot be computed for time segmentes that are not daily (cover 00:00:00 to 23:59:59 of every day). Skipping ", time_segment)) location_features <- create_empty_file(requested_features) } else { # Count how many minutes of data we use to get location features @@ -70,7 +70,7 @@ barnett_features <- function(sensor_data_files, day_segment, params){ summarise(minutes_data_used = sum(n_minutes)) %>% select(local_date, minutes_data_used) - # Save day segment to attach it later + # Save time segment to attach it later location_dates_segments <- location %>% select(local_date, local_segment) %>% distinct(local_date, .keep_all = TRUE) # Select only the columns that the algorithm needs @@ -92,7 +92,7 @@ barnett_features <- function(sensor_data_files, day_segment, params){ colnames(features)=c("local_date",tolower(colnames(outputMobility$featavg))) # Add the minute count column features <- left_join(features, location_minutes_used, by = "local_date") - # Add the day segment column for consistency + # Add the time segment column for consistency features <- left_join(features, location_dates_segments, by = "local_date") location_features <- features %>% select(all_of(requested_features)) } diff --git a/src/features/phone_locations/doryab/main.py b/src/features/phone_locations/doryab/main.py index 4164811c..a29efc37 100644 --- a/src/features/phone_locations/doryab/main.py +++ b/src/features/phone_locations/doryab/main.py @@ -4,7 +4,7 @@ from astropy.timeseries import LombScargle from sklearn.cluster import DBSCAN from math import radians, cos, sin, asin, sqrt -def doryab_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs): +def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs): location_data = pd.read_csv(sensor_data_files["sensor_data"]) requested_features = provider["FEATURES"] @@ -28,7 +28,7 @@ def doryab_features(sensor_data_files, day_segment, provider, filter_data_by_seg if location_data.empty: location_features = pd.DataFrame(columns=["local_segment"] + features_to_compute) else: - location_data = filter_data_by_segment(location_data, day_segment) + location_data = filter_data_by_segment(location_data, time_segment) if location_data.empty: location_features = pd.DataFrame(columns=["local_segment"] + features_to_compute) @@ -47,7 +47,7 @@ def doryab_features(sensor_data_files, day_segment, provider, 
filter_data_by_seg location_data = location_data[(location_data['double_latitude']!=0.0) & (location_data['double_longitude']!=0.0)] if location_data.empty: - location_features = pd.DataFrame(columns=["local_date"] + ["location_" + day_segment + "_" + x for x in features_to_compute]) + location_features = pd.DataFrame(columns=["local_date"] + ["location_" + time_segment + "_" + x for x in features_to_compute]) location_features = location_features.reset_index(drop=True) return location_features diff --git a/src/features/phone_messages/rapids/main.R b/src/features/phone_messages/rapids/main.R index ac566016..b92769fd 100644 --- a/src/features/phone_messages/rapids/main.R +++ b/src/features/phone_messages/rapids/main.R @@ -1,7 +1,7 @@ library('tidyr') library('stringr') -message_features_of_type <- function(messages, messages_type, day_segment, requested_features){ +message_features_of_type <- function(messages, messages_type, time_segment, requested_features){ # Output dataframe features = data.frame(local_segment = character(), stringsAsFactors = FALSE) @@ -47,14 +47,14 @@ message_features_of_type <- function(messages, messages_type, day_segment, reque return(features) } -rapids_features <- function(sensor_data_files, day_segment, provider){ +rapids_features <- function(sensor_data_files, time_segment, provider){ messages_data <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE) - messages_data <- messages_data %>% filter_data_by_segment(day_segment) + messages_data <- messages_data %>% filter_data_by_segment(time_segment) messages_types = provider[["MESSAGES_TYPES"]] messages_features <- setNames(data.frame(matrix(ncol=1, nrow=0)), c("local_segment")) for(message_type in messages_types){ - # Filter rows that belong to the message type and day segment of interest + # Filter rows that belong to the message type and time segment of interest message_type_label = ifelse(message_type == "received", "1", ifelse(message_type == "sent", "2", NA)) if(is.na(message_type_label)) stop(paste("Message type can online be received or sent but instead you typed: ", message_type, " in config[PHONE_MESSAGES][MESSAGES_TYPES]")) @@ -62,7 +62,7 @@ rapids_features <- function(sensor_data_files, day_segment, provider){ requested_features <- provider[["FEATURES"]][[message_type]] messages_of_type <- messages_data %>% filter(message_type == message_type_label) - features <- message_features_of_type(messages_of_type, message_type, day_segment, requested_features) + features <- message_features_of_type(messages_of_type, message_type, time_segment, requested_features) messages_features <- merge(messages_features, features, all=TRUE) } messages_features <- messages_features %>% mutate_at(vars(contains("countmostfrequentcontact") | contains("distinctcontacts") | contains("count")), list( ~ replace_na(., 0))) diff --git a/src/features/phone_screen/rapids/main.py b/src/features/phone_screen/rapids/main.py index fe49a53c..9a99342c 100644 --- a/src/features/phone_screen/rapids/main.py +++ b/src/features/phone_screen/rapids/main.py @@ -1,7 +1,7 @@ import pandas as pd import itertools -def getEpisodeDurationFeatures(screen_data, day_segment, episode, features, reference_hour_first_use): +def getEpisodeDurationFeatures(screen_data, time_segment, episode, features, reference_hour_first_use): screen_data_episode = screen_data[screen_data["episode"] == episode] duration_helper = pd.DataFrame() if "countepisode" in features: @@ -25,7 +25,7 @@ def getEpisodeDurationFeatures(screen_data, day_segment, episode, 
features, refe return duration_helper -def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs): +def rapids_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs): screen_data = pd.read_csv(sensor_data_files["sensor_episodes"]) @@ -48,7 +48,7 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg screen_features = pd.DataFrame(columns=["local_segment"] + features_to_compute) if not screen_data.empty: - screen_data = filter_data_by_segment(screen_data, day_segment) + screen_data = filter_data_by_segment(screen_data, time_segment) if not screen_data.empty: if ignore_episodes_shorter_than > 0: @@ -59,7 +59,7 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg if not screen_data.empty: screen_features = pd.DataFrame() for episode in episode_type_to_compute: - screen_features = pd.concat([screen_features, getEpisodeDurationFeatures(screen_data, day_segment, episode, features_episodes_to_compute, reference_hour_first_use)], axis=1) + screen_features = pd.concat([screen_features, getEpisodeDurationFeatures(screen_data, time_segment, episode, features_episodes_to_compute, reference_hour_first_use)], axis=1) if not screen_features.empty: screen_features = screen_features.reset_index() diff --git a/src/features/phone_wifi_connected/rapids/main.R b/src/features/phone_wifi_connected/rapids/main.R index 84513249..7ab44c6e 100644 --- a/src/features/phone_wifi_connected/rapids/main.R +++ b/src/features/phone_wifi_connected/rapids/main.R @@ -1,7 +1,7 @@ library("dplyr", warn.conflicts = F) -compute_wifi_feature <- function(data, feature, day_segment){ - data <- data %>% filter_data_by_segment(day_segment) +compute_wifi_feature <- function(data, feature, time_segment){ + data <- data %>% filter_data_by_segment(time_segment) if(feature %in% c("countscans", "uniquedevices")){ data <- data %>% group_by(local_segment) data <- switch(feature, @@ -25,7 +25,7 @@ compute_wifi_feature <- function(data, feature, day_segment){ } } -rapids_features <- function(sensor_data_files, day_segment, provider){ +rapids_features <- function(sensor_data_files, time_segment, provider){ wifi_data <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE) requested_features <- provider[["FEATURES"]] # Output dataframe @@ -38,7 +38,7 @@ rapids_features <- function(sensor_data_files, day_segment, provider){ features_to_compute <- intersect(base_features_names, requested_features) for(feature_name in features_to_compute){ - feature <- compute_wifi_feature(wifi_data, feature_name, day_segment) + feature <- compute_wifi_feature(wifi_data, feature_name, time_segment) features <- merge(features, feature, by="local_segment", all = TRUE) } diff --git a/src/features/phone_wifi_visible/rapids/main.R b/src/features/phone_wifi_visible/rapids/main.R index 84513249..7ab44c6e 100644 --- a/src/features/phone_wifi_visible/rapids/main.R +++ b/src/features/phone_wifi_visible/rapids/main.R @@ -1,7 +1,7 @@ library("dplyr", warn.conflicts = F) -compute_wifi_feature <- function(data, feature, day_segment){ - data <- data %>% filter_data_by_segment(day_segment) +compute_wifi_feature <- function(data, feature, time_segment){ + data <- data %>% filter_data_by_segment(time_segment) if(feature %in% c("countscans", "uniquedevices")){ data <- data %>% group_by(local_segment) data <- switch(feature, @@ -25,7 +25,7 @@ compute_wifi_feature <- function(data, feature, day_segment){ } } -rapids_features <- 
function(sensor_data_files, day_segment, provider){ +rapids_features <- function(sensor_data_files, time_segment, provider){ wifi_data <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE) requested_features <- provider[["FEATURES"]] # Output dataframe @@ -38,7 +38,7 @@ rapids_features <- function(sensor_data_files, day_segment, provider){ features_to_compute <- intersect(base_features_names, requested_features) for(feature_name in features_to_compute){ - feature <- compute_wifi_feature(wifi_data, feature_name, day_segment) + feature <- compute_wifi_feature(wifi_data, feature_name, time_segment) features <- merge(features, feature, by="local_segment", all = TRUE) } diff --git a/src/features/utils/utils.R b/src/features/utils/utils.R index 3a03bafb..11d25557 100644 --- a/src/features/utils/utils.R +++ b/src/features/utils/utils.R @@ -2,14 +2,14 @@ library("stringr") rapids_log_tag <- "RAPIDS:" -filter_data_by_segment <- function(data, day_segment){ - # Filter the rows that belong to day_segment, and put the segment full name in a new column for grouping +filter_data_by_segment <- function(data, time_segment){ + # Filter the rows that belong to time_segment, and put the segment full name in a new column for grouping datetime_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}" timestamp_regex = "[0-9]{13}" data <- data %>% - filter(grepl(paste0("\\[", day_segment, "#"), assigned_segments)) %>% - mutate(local_segment = str_extract(assigned_segments, paste0("\\[", day_segment, "#", datetime_regex, ",", datetime_regex, ";", timestamp_regex, ",", timestamp_regex, "\\]"))) %>% - extract(local_segment, into = c("local_segment", "timestamps_segment"), paste0("\\[(", day_segment, "#", datetime_regex, ",", datetime_regex, ");(", timestamp_regex, ",", timestamp_regex, ")\\]")) %>% + filter(grepl(paste0("\\[", time_segment, "#"), assigned_segments)) %>% + mutate(local_segment = str_extract(assigned_segments, paste0("\\[", time_segment, "#", datetime_regex, ",", datetime_regex, ";", timestamp_regex, ",", timestamp_regex, "\\]"))) %>% + extract(local_segment, into = c("local_segment", "timestamps_segment"), paste0("\\[(", time_segment, "#", datetime_regex, ",", datetime_regex, ");(", timestamp_regex, ",", timestamp_regex, ")\\]")) %>% select(-assigned_segments) return(data) } @@ -44,10 +44,10 @@ chunk_episodes <- function(sensor_episodes){ return(chunked_episodes) } -fetch_provider_features <- function(provider, provider_key, sensor_key, sensor_data_files, day_segments_file){ +fetch_provider_features <- function(provider, provider_key, sensor_key, sensor_data_files, time_segments_file){ sensor_features <- data.frame(local_segment = character(), stringsAsFactors = FALSE) - day_segments_labels <- read.csv(day_segments_file, stringsAsFactors = FALSE) + time_segments_labels <- read.csv(time_segments_file, stringsAsFactors = FALSE) if(!"FEATURES" %in% names(provider)) stop(paste0("Provider config[", sensor_key,"][PROVIDERS][", provider_key,"] is missing a FEATURES attribute in config.yaml")) @@ -56,13 +56,13 @@ fetch_provider_features <- function(provider, provider_key, sensor_key, sensor_d code_path <- paste0("src/features/", sensor_key,"/", provider[["SRC_FOLDER"]], "/main.R") source(code_path) features_function <- match.fun(paste0(provider[["SRC_FOLDER"]], "_features")) - day_segments <- day_segments_labels %>% pull(label) - for (day_segment in day_segments){ - print(paste(rapids_log_tag,"Processing", sensor_key, provider_key, day_segment)) + time_segments <- 
time_segments_labels %>% pull(label) + for (time_segment in time_segments){ + print(paste(rapids_log_tag,"Processing", sensor_key, provider_key, time_segment)) - features <- features_function(sensor_data_files, day_segment, provider) + features <- features_function(sensor_data_files, time_segment, provider) if(!"local_segment" %in% colnames(features)) - stop(paste0("The dataframe returned by the ",sensor_key," provider '", provider_key,"' is missing the 'local_segment' column added by the 'filter_data_by_segment()' function. Check the provider script is using such function and is not removing 'local_segment' by accident (", code_path,")\n The 'local_segment' column is used to index a provider's features (each row corresponds to a different day segment instance (e.g. 2020-01-01, 2020-01-02, 2020-01-03, etc.)")) + stop(paste0("The dataframe returned by the ",sensor_key," provider '", provider_key,"' is missing the 'local_segment' column added by the 'filter_data_by_segment()' function. Check the provider script is using such function and is not removing 'local_segment' by accident (", code_path,")\n The 'local_segment' column is used to index a provider's features (each row corresponds to a different time segment instance (e.g. 2020-01-01, 2020-01-02, 2020-01-03, etc.)")) features <- features %>% rename_at(vars(!matches("local_segment")), ~ paste(sensor_key, provider_key, ., sep = "_")) sensor_features <- merge(sensor_features, features, all = TRUE) } diff --git a/src/features/utils/utils.py b/src/features/utils/utils.py index 525a2089..6938f875 100644 --- a/src/features/utils/utils.py +++ b/src/features/utils/utils.py @@ -1,13 +1,13 @@ rapids_log_tag = "RAPIDS:" -def filter_data_by_segment(data, day_segment): +def filter_data_by_segment(data, time_segment): datetime_regex = "[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}" timestamps_regex = "[0-9]{13}" - segment_regex = "\[({}#{},{};{},{})\]".format(day_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex) + segment_regex = "\[({}#{},{};{},{})\]".format(time_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex) data["local_segment"] = data["assigned_segments"].str.extract(segment_regex, expand=True) data = data.drop(columns=["assigned_segments"]) data = data.dropna(subset = ["local_segment"]) - if(data.shape[0] == 0): # there are no rows belonging to day_segment + if(data.shape[0] == 0): # there are no rows belonging to time_segment data["timestamps_segment"] = None else: data[["local_segment","timestamps_segment"]] = data["local_segment"].str.split(pat =";",n=1, expand=True) @@ -69,12 +69,12 @@ def chunk_episodes(sensor_episodes): return merged_sensor_episodes -def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, day_segments_file): +def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file): import pandas as pd from importlib import import_module, util sensor_features = pd.DataFrame(columns=["local_segment"]) - day_segments_labels = pd.read_csv(day_segments_file, header=0) + time_segments_labels = pd.read_csv(time_segments_file, header=0) if "FEATURES" not in provider: raise ValueError("Provider config[{}][PROVIDERS][{}] is missing a FEATURES attribute in config.yaml".format(sensor_key.upper(), provider_key.upper())) @@ -84,11 +84,11 @@ def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_file feature_module = import_module(code_path) feature_function = getattr(feature_module, 
provider["SRC_FOLDER"] + "_features") - for day_segment in day_segments_labels["label"]: - print("{} Processing {} {} {}".format(rapids_log_tag, sensor_key, provider_key, day_segment)) - features = feature_function(sensor_data_files, day_segment, provider, filter_data_by_segment=filter_data_by_segment, chunk_episodes=chunk_episodes) + for time_segment in time_segments_labels["label"]: + print("{} Processing {} {} {}".format(rapids_log_tag, sensor_key, provider_key, time_segment)) + features = feature_function(sensor_data_files, time_segment, provider, filter_data_by_segment=filter_data_by_segment, chunk_episodes=chunk_episodes) if not "local_segment" in features.columns: - raise ValueError("The dataframe returned by the " + sensor_key + " provider '" + provider_key + "' is missing the 'local_segment' column added by the 'filter_data_by_segment()' function. Check the provider script is using such function and is not removing 'local_segment' by accident (" + code_path + ")\n The 'local_segment' column is used to index a provider's features (each row corresponds to a different day segment instance (e.g. 2020-01-01, 2020-01-02, 2020-01-03, etc.)") + raise ValueError("The dataframe returned by the " + sensor_key + " provider '" + provider_key + "' is missing the 'local_segment' column added by the 'filter_data_by_segment()' function. Check the provider script is using such function and is not removing 'local_segment' by accident (" + code_path + ")\n The 'local_segment' column is used to index a provider's features (each row corresponds to a different time segment instance (e.g. 2020-01-01, 2020-01-02, 2020-01-03, etc.)") features.columns = ["{}{}".format("" if col.startswith("local_segment") else (sensor_key + "_"+ provider_key + "_"), col) for col in features.columns] sensor_features = sensor_features.merge(features, how="outer") else: diff --git a/src/models/workflow_example/parse_targets.py b/src/models/workflow_example/parse_targets.py index a3fd1e8b..73ec542b 100644 --- a/src/models/workflow_example/parse_targets.py +++ b/src/models/workflow_example/parse_targets.py @@ -11,11 +11,11 @@ spec.loader.exec_module(mod) filter_data_by_segment = getattr(mod, "filter_data_by_segment") targets = pd.read_csv(snakemake.input["targets"]) -day_segments_labels = pd.read_csv(snakemake.input["day_segments_labels"], header=0) +time_segments_labels = pd.read_csv(snakemake.input["time_segments_labels"], header=0) all_targets = pd.DataFrame(columns=["local_segment"]) -for day_segment in day_segments_labels["label"]: - filtered_targets = filter_data_by_segment(targets, day_segment) +for time_segment in time_segments_labels["label"]: + filtered_targets = filter_data_by_segment(targets, time_segment) all_targets = all_targets.merge(filtered_targets, how="outer") segment_colums = pd.DataFrame() diff --git a/tests/scripts/utils.py b/tests/scripts/utils.py index 30c1a3c1..e5db9cda 100644 --- a/tests/scripts/utils.py +++ b/tests/scripts/utils.py @@ -49,11 +49,11 @@ def generate_file_list(configs, sensor): # i.e. The sensor passed into the function. 
 
     # Initialize string of file path for both expected and actual metric values
-    act_str = "data/processed/features/{pid}/{sensor}_{sensor_type}{day_segment}.csv"
-    exp_str = "tests/data/processed/features/period/{pid}/{sensor}_{sensor_type}{day_segment}.csv"
+    act_str = "data/processed/features/{pid}/{sensor}_{sensor_type}{time_segment}.csv"
+    exp_str = "tests/data/processed/features/period/{pid}/{sensor}_{sensor_type}{time_segment}.csv"
 
     sensor_cap = sensor.upper()
-    if 'DAY_SEGMENTS' and 'FEATURES' in configs[sensor_cap]:
+    if 'TIME_SEGMENTS' in configs[sensor_cap] and 'FEATURES' in configs[sensor_cap]:
         sensor_type = []
         if 'TYPES' in configs[sensor_cap]:
             for each in configs[sensor_cap]['TYPES']:
@@ -62,24 +62,24 @@ def generate_file_list(configs, sensor):
         act_file_list = expand(act_str,pid=configs["PIDS"],
                                sensor = sensor,
                                sensor_type = sensor_type,
-                               day_segment = configs[sensor_cap]["DAY_SEGMENTS"])
+                               time_segment = configs[sensor_cap]["TIME_SEGMENTS"])
 
         exp_file_list = expand(exp_str,pid=configs["PIDS"],
                                sensor = sensor,
                                sensor_type = sensor_type,
-                               day_segment = configs[sensor_cap]["DAY_SEGMENTS"])
+                               time_segment = configs[sensor_cap]["TIME_SEGMENTS"])
 
     return zip(act_file_list, exp_file_list)
 
 
 def generate_sensor_file_lists(configs):
     # Go through the configs and select those sensors with COMPUTE = True.
-    # Also get DAY_SEGMENTS, and optionally TYPES then create expected
+    # Also get TIME_SEGMENTS, and optionally TYPES then create expected
     # files. Return dictionary with list of file paths of expected and
     # actual files for each sensor listed in the config file. Added for Travis.
 
     # Initialize string of file path for both expected and actual metric values
-    segment = configs['DAY_SEGMENTS']['TYPE'].lower()
+    segment = configs['TIME_SEGMENTS']['TYPE'].lower()
     print(segment)
     act_str = "data/processed/features/"+segment+"/{pid}/{sensor_key}.csv"
     exp_str = "tests/data/processed/features/"+segment+"/{pid}/{sensor_key}.csv"
diff --git a/tests/settings/frequency/config.yaml b/tests/settings/frequency/config.yaml
index 35053d1e..ca2ae708 100644
--- a/tests/settings/frequency/config.yaml
+++ b/tests/settings/frequency/config.yaml
@@ -2,4 +2,4 @@ directory: ./
 configfile: ./tests/settings/frequency/testing_config.yaml
 snakefile: ./tests/Snakefile
 cores: 1
-forcerun: [compute_day_segments, join_features_from_providers]
\ No newline at end of file
+forcerun: [compute_time_segments, join_features_from_providers]
\ No newline at end of file
diff --git a/tests/settings/frequency/testing_config.yaml b/tests/settings/frequency/testing_config.yaml
index f3fd5f0e..54d2e379 100644
--- a/tests/settings/frequency/testing_config.yaml
+++ b/tests/settings/frequency/testing_config.yaml
@@ -2,11 +2,11 @@
 # You must create a file for each participant named pXXX containing their device_id. This can be done manually or automatically
 PIDS: [test03, test04,test05, test06]
 
-# Global var with common day segments
-DAY_SEGMENTS: &day_segments
+# Global var with common time segments
+TIME_SEGMENTS: &time_segments
   TYPE: FREQUENCY # FREQUENCY, PERIODIC, EVENT
-  FILE: "data/external/daysegments_frequency.csv"
-  INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC, if set to TRUE we consider day segments back enough in the past as to include the first day of data
+  FILE: "data/external/timesegments_frequency.csv"
+  INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC; if set to TRUE we consider time segments far enough in the past to include the first day of data
 
 # Use tz codes from https://en.wikipedia.org/wiki/List_of_tz_database_time_zones. Double check your code, for example EST is not US Eastern Time.
 TIMEZONE: &timezone
diff --git a/tests/settings/periodic/config.yaml b/tests/settings/periodic/config.yaml
index 40a2188e..739120f0 100644
--- a/tests/settings/periodic/config.yaml
+++ b/tests/settings/periodic/config.yaml
@@ -2,4 +2,4 @@ directory: ./
 configfile: ./tests/settings/periodic/testing_config.yaml
 snakefile: ./tests/Snakefile
 cores: 1
-forcerun: [compute_day_segments, join_features_from_providers]
\ No newline at end of file
+forcerun: [compute_time_segments, join_features_from_providers]
\ No newline at end of file
diff --git a/tests/settings/periodic/testing_config.yaml b/tests/settings/periodic/testing_config.yaml
index e59253cc..5fe9c839 100644
--- a/tests/settings/periodic/testing_config.yaml
+++ b/tests/settings/periodic/testing_config.yaml
@@ -2,11 +2,11 @@
 # You must create a file for each participant named pXXX containing their device_id. This can be done manually or automatically
 PIDS: [test01, test02, test03, test04]
 
-# Global var with common day segments
-DAY_SEGMENTS: &day_segments
+# Global var with common time segments
+TIME_SEGMENTS: &time_segments
   TYPE: PERIODIC # FREQUENCY, PERIODIC, EVENT
-  FILE: "data/external/daysegments_periodic.csv"
-  INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC, if set to TRUE we consider day segments back enough in the past as to include the first day of data
+  FILE: "data/external/timesegments_periodic.csv"
+  INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC; if set to TRUE we consider time segments far enough in the past to include the first day of data
 
 # Use tz codes from https://en.wikipedia.org/wiki/List_of_tz_database_time_zones. Double check your code, for example EST is not US Eastern Time.
 TIMEZONE: &timezone
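For reviewers who want to see the renamed helper in action, here is a minimal, self-contained sketch of the extraction step that filter_data_by_segment performs in src/features/utils/utils.py. The segment string, the "daily" label, and the steps value are made up; only the regexes mirror the patched code, and expand=False is used so the sketch assigns a plain Series:

    import pandas as pd

    # One hypothetical sensor row; the 13-digit numbers are epoch milliseconds.
    data = pd.DataFrame({
        "assigned_segments": ["[daily#2020-01-01 00:00:00,2020-01-01 23:59:59;1577854800000,1577941199999]"],
        "steps": [3500],
    })

    datetime_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
    timestamps_regex = "[0-9]{13}"
    segment_regex = "\\[({}#{},{};{},{})\\]".format("daily", datetime_regex, datetime_regex, timestamps_regex, timestamps_regex)

    # The capture group becomes the local_segment value used to group features per segment instance.
    data["local_segment"] = data["assigned_segments"].str.extract(segment_regex, expand=False)
    print(data.loc[0, "local_segment"])
    # daily#2020-01-01 00:00:00,2020-01-01 23:59:59;1577854800000,1577941199999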