Add support for multiple time zones

pull/128/head
JulioV 2021-03-05 17:49:37 -05:00
parent f53b74e280
commit fb054b539f
18 changed files with 381 additions and 125 deletions

View File

@@ -33,7 +33,17 @@ TIME_SEGMENTS: &time_segments
    FILE: "data/external/timesegments_periodic.csv"
    INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC, see docs
TIMEZONE:
    TYPE: MULTIPLE
    SINGLE:
        TZCODE: America/New_York
    MULTIPLE:
        TZCODES_FILE: data/external/multiple_timezones_example.csv
        IF_MISSING_TZCODE: USE_DEFAULT
        DEFAULT_TZCODE: America/Los_Angeles
        FITBIT:
            ALLOW_MULTIPLE_TZ_PER_DEVICE: False
            INFER_FROM_SMARTPHONE_TZ: False
########################################################################################################################
# PHONE                                                                                                                #

@@ -263,7 +273,7 @@ PHONE_LOCATIONS:
            COMPUTE: False
            FEATURES: ["hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","circdnrtn","wkenddayrtn"]
            ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius
            IF_MULTIPLE_TIMEZONES: USE_MOST_COMMON
            MINUTES_DATA_USED: False # Use this for quality control purposes, how many minutes of data (location coordinates grouped by minute) were used to compute features
            SRC_FOLDER: "barnett" # inside src/features/phone_locations
            SRC_LANGUAGE: "r"

View File

@@ -5,15 +5,27 @@ Sensor parameters description for `[PHONE_LOCATIONS]`:

|Key                                                                                        | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Database table where the location data is stored
|`[LOCATIONS_TO_USE]`| Type of location data to use, one of `ALL`, `GPS`, `ALL_RESAMPLED` or `FUSED_RESAMPLED`. This filter is based on the `provider` column of the locations table: `ALL` includes every row, `GPS` only includes rows where the provider is gps, `ALL_RESAMPLED` includes all rows after being resampled, and `FUSED_RESAMPLED` only includes rows where the provider is fused after being resampled.
|`[FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD]`| If `ALL_RESAMPLED` or `FUSED_RESAMPLED` is used, the original fused data has to be resampled; a location row is resampled to the next valid timestamp (see the Assumptions/Observations below) only if the time difference between them is less than or equal to this threshold (in minutes).
|`[FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION]`| If `ALL_RESAMPLED` or `FUSED_RESAMPLED` is used, the original fused data has to be resampled; a location row is resampled at most for this long (in minutes)

!!! note "Assumptions/Observations"
**Types of location data to use**
Android and iOS clients can collect location coordinates through the phone's GPS, the network cellular towers around the phone, or Google's fused location API.
- If you want to use only the GPS provider, set `[LOCATIONS_TO_USE]` to `GPS`
- If you want to use all providers, set `[LOCATIONS_TO_USE]` to `ALL`
- If you collected location data from different providers, including the fused API, use `ALL_RESAMPLED`
- If your mobile client was configured to use fused location only or you want to focus only on this provider, set `[LOCATIONS_TO_USE]` to `RESAMPLE_FUSED`.
`ALL_RESAMPLED` and `RESAMPLE_FUSED` take the original location coordinates and replicate each pair forward in time as long as the phone was sensing data as indicated by the joined timestamps of [`[PHONE_DATA_YIELD][SENSORS]`](../phone-data-yield/). This is done because Google's API only logs a new location coordinate pair when it is sufficiently different in time or space from the previous one and because GPS and network providers can log data at variable rates.
There are two parameters associated with resampling fused location.
1. `FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD` (in minutes, default 30) controls the maximum gap between any two coordinate pairs to replicate the last known pair. For example, if participant A's phone did not collect data between 10:30 am and 10:50 am and between 11:05 am and 11:40 am, the last known coordinate pair is replicated during the first period but not the second. In other words, we assume that we can no longer guarantee the participant stayed at the last known location if the phone did not sense data for more than 30 minutes.
2. `FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION` (in minutes, default 720 or 12 hours) stops the last known fused location from being replicated longer than this threshold even if the phone was sensing data continuously. For example, if participant A went home at 9 pm and their phone sensed data without gaps until 11 am the next morning, the last known location is only replicated until 9 am.
If you have suggestions to modify or improve this resampling, let us know.
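To make the interaction of these two thresholds concrete, here is a minimal sketch, not RAPIDS' actual implementation; the tibble, column names, and the chain-break simplification are illustrative only:

```r
library(dplyr)
library(tibble)

consecutive_threshold <- 30 # minutes, FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD
time_since_valid <- 720     # minutes, FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION

# One valid fused fix at minute 0, then the minutes at which the phone sensed data
last_valid_fix <- 0
sensed <- tibble(minute = c(0, 10, 25, 70, 90))

sensed %>%
    mutate(gap = minute - lag(minute, default = first(minute)),
           # A row inherits the last known coordinates only if the gap from the
           # previous sensed row is within the threshold AND we have not been
           # replicating for longer than the 12-hour cap (chain breaks after a
           # failed gap check are ignored here for simplicity)
           inherits_location = gap <= consecutive_threshold &
                               (minute - last_valid_fix) <= time_since_valid)
```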
## BARNETT provider
@@ -36,13 +48,13 @@ These features are based on the original open-source implementation by [Barnett

Parameters description for `[PHONE_LOCATIONS][PROVIDERS][BARNETT]`:

|Key                                            | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[COMPUTE]`| Set to `True` to extract `PHONE_LOCATIONS` features from the `BARNETT` provider|
|`[FEATURES]` | Features to be computed, see table below
|`[ACCURACY_LIMIT]` | An integer in meters; any location rows with an accuracy higher than this are dropped. This number means there's a 68% probability the actual location is within this radius
|`[IF_MULTIPLE_TIMEZONES]` | Currently, `USE_MOST_COMMON` is the only value supported. If the location data for a participant belongs to multiple time zones, we select the most common because Barnett's algorithm can only handle one time zone (see the sketch after this table).
|`[MINUTES_DATA_USED]` | Set to `True` to include an extra column in the final location feature file containing the number of minutes used to compute the features on each time segment. Use this for quality control purposes; the more data minutes exist for a period, the more reliable its features should be. For fused location, a single minute can contain more than one coordinate pair if the participant is moving fast enough.
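As a minimal sketch of what `USE_MOST_COMMON` implies (the `location_data` tibble and its `local_timezone` column are illustrative names, not RAPIDS' internals), the most common time zone can be picked like this:

```r
library(dplyr)

location_data <- tibble::tibble(
    local_timezone = c("America/New_York", "America/New_York", "America/Chicago")
)

# Pick the time zone with the most location rows for this participant
most_common_tz <- location_data %>%
    count(local_timezone, sort = TRUE) %>%
    slice(1) %>%
    pull(local_timezone)
```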
@@ -50,9 +62,9 @@ Features description for `[PHONE_LOCATIONS][PROVIDERS][BARNETT]` adapted from [B

|Feature                    |Units      |Description|
|-------------------------- |---------- |---------------------------|
|hometime                   |minutes    | Time at home. Time spent at home in minutes. Home is the most visited significant location between 8 pm and 8 am, including any pauses within a 200-meter radius.
|disttravelled              |meters     | Total distance traveled over a day (flights).
|rog                        |meters     | The Radius of Gyration (rog) is a measure in meters of the area covered by a person over a day. A centroid is calculated for all the places (pauses) visited during a day, and a weighted distance between all the places and that centroid is computed. The weights are proportional to the time spent in each place.
|maxdiam                    |meters     | The maximum diameter is the largest distance between any two pauses.
|maxhomedist                |meters     | The maximum distance from home in meters.
|siglocsvisited             |locations  | The number of significant locations visited during the day. Significant locations are computed using k-means clustering over pauses found in the whole monitoring period. The number of clusters is found by iterating k from 1 to 200, stopping when the centroids of two significant locations are within 400 meters of one another.
@@ -61,16 +73,16 @@ Features description for `[PHONE_LOCATIONS][PROVIDERS][BARNETT]` adapted from [B
|avgflightdur               |seconds    | Mean duration of all flights.
|stdflightdur               |seconds    | The standard deviation of the duration of all flights.
|probpause                  | -         | The fraction of a day spent in a pause (as opposed to a flight)
|siglocentropy              |nats       | Shannon's entropy measurement is based on the proportion of time spent at each significant location visited during a day.
|circdnrtn                  | -         | A continuous metric quantifying a person's circadian routine that can take any value between 0 and 1, where 0 represents a daily routine completely different from any other sensed days and 1 a routine the same as every other sensed day.
|wkenddayrtn                | -         | Same as circdnrtn but computed separately for weekends and weekdays.
!!! note "Assumptions/Observations"
**Barnett et al.'s features**
These features are based on a Pause-Flight model. A pause is defined as a mobility trace (location pings) within a certain duration and distance (by default, 300 seconds and 60 meters). A flight is any mobility trace between two pauses. Data is resampled and imputed before the features are computed. See [Barnett et al](../../citation#barnett-locations) for more information. In RAPIDS, we only expose one parameter for these features (the accuracy limit). You can change other parameters in `src/features/phone_locations/barnett/library/MobilityFeatures.R`.
**Significant Locations**
Significant locations are determined using K-means clustering on pauses longer than 10 minutes. The number of clusters (K) is increased until no two clusters are within 400 meters from each other. After this, pauses within a certain range of a cluster (200 meters by default) count as a visit to that significant location. This description was adapted from the Supplementary Materials of [Barnett et al](../../citation#barnett-locations).

**The Circadian Calculation**
For a detailed description of how this is calculated, see [Canzian et al](../../citation#barnett-locations).
@@ -105,12 +117,12 @@ Parameters description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:

| `[DBSCAN_EPS]` | The maximum distance in meters between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function.
| `[DBSCAN_MINSAMPLES]` | The number of samples (or total weight) in a neighborhood for a point to be considered as a core point of a cluster. This includes the point itself.
| `[THRESHOLD_STATIC]` | The threshold value in km/hr used to label a row as Static or Moving.
| `[MAXIMUM_ROW_GAP]` | The maximum gap (in seconds) allowed between any two consecutive rows for them to be considered part of the same displacement. If this threshold is too high, it can throw speed and distance calculations off for periods when the phone was not sensing.
| `[MAXIMUM_ROW_DURATION]` | The time difference between any two consecutive rows `A` and `B` is considered as the time a participant spent in `A`. If this difference is bigger than `MAXIMUM_ROW_GAP`, we substitute it with `MAXIMUM_ROW_DURATION`.
| `[MINUTES_DATA_USED]` | Set to `True` to include an extra column in the final location feature file containing the number of minutes used to compute the features on each time segment. Use this for quality control purposes; the more data minutes exist for a period, the more reliable its features should be. For fused location, a single minute can contain more than one coordinate pair if the participant is moving fast enough.
| `[SAMPLING_FREQUENCY]` | Expected time difference between any two location rows in minutes. If set to `0`, the sampling frequency will be inferred automatically as the median of all the differences between two consecutive row timestamps (recommended if you are using `FUSED_RESAMPLED` data). This parameter impacts all the time calculations.
| `[CLUSTER_ON]` | Set this flag to `PARTICIPANT_DATASET` to create clusters based on the entire participant's dataset or to `TIME_SEGMENT` to create clusters based on all the instances of the corresponding time segment (e.g. all mornings).
| `[CLUSTERING_ALGORITHM]` | The original Doryab et al. implementation uses `DBSCAN`; `OPTICS` is also available with similar (but not identical) clustering results and lower memory consumption.
| `[RADIUS_FOR_HOME]` | All location coordinates within this distance (meters) from the home location coordinates are considered a homestay (see the `timeathome` feature).
@@ -120,24 +132,24 @@ Features description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:

|Feature                    |Units      |Description|
|-------------------------- |---------- |---------------------------|
|locationvariance           |$meters^2$ |The sum of the variances of the latitude and longitude columns.
|loglocationvariance        | -         | Log of the sum of the variances of the latitude and longitude columns.
|totaldistance              |meters     |Total distance traveled in a time segment using the haversine formula.
|averagespeed               |km/hr      |Average speed in a time segment considering only the instances labeled as Moving.
|varspeed                   |km/hr      |Speed variance in a time segment considering only the instances labeled as Moving.
|{--circadianmovement--}    |-          | Not suggested for use now; see Observations below. "It encodes the extent to which a person's location patterns follow a 24-hour circadian cycle." [Doryab et al.](../../citation#doryab-locations).
|numberofsignificantplaces  |places     |Number of significant locations visited. It is calculated using the DBSCAN/OPTICS clustering algorithm which takes in EPS and MIN_SAMPLES as parameters to identify clusters. Each cluster is a significant place.
|numberlocationtransitions  |transitions |Number of movements between any two clusters in a time segment.
|radiusgyration             |meters     |Quantifies the area covered by a participant
|timeattop1location         |minutes    |Time spent at the most significant location.
|timeattop2location         |minutes    |Time spent at the 2nd most significant location.
|timeattop3location         |minutes    |Time spent at the 3rd most significant location.
|movingtostaticratio        | -         | Ratio between stationary time and total location sensed time. A lat/long coordinate pair is labeled as stationary if its speed (distance/time) to the next coordinate pair is less than 1 km/hr. A higher value represents a more stationary routine. These times are computed using the timeInSeconds feature.
|outlierstimepercent        | -         | Ratio between the time spent in non-significant clusters divided by the time spent in all clusters (total location sensed time). A higher value represents more time spent in non-significant clusters. These times are computed using the timeInSeconds feature.
|maxlengthstayatclusters    |minutes    |Maximum time spent in a cluster (significant location).
|minlengthstayatclusters    |minutes    |Minimum time spent in a cluster (significant location).
|meanlengthstayatclusters   |minutes    |Average time spent in a cluster (significant location).
|stdlengthstayatclusters    |minutes    |Standard deviation of time spent in a cluster (significant location).
|locationentropy            |nats       |Shannon Entropy computed over the row count of each cluster (significant location); it is higher the more rows belong to a cluster (i.e., the more time a participant spent at a significant location).
|normalizedlocationentropy  |nats       |Shannon Entropy computed over the row count of each cluster (significant location) divided by the number of clusters; it is higher the more rows belong to a cluster (i.e., the more time a participant spent at a significant location).
|timeathome                 |minutes    | Time spent at home (see Observations below for a description of how we compute home).
@@ -146,13 +158,13 @@ Features description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
Significant locations are determined using DBSCAN clustering on the locations that a patient visited over the course of the period of data collection.

**Circadian Movement Calculation**
Note, Feb 3 2021: it seems the implementation of this feature is not correct; we suggest not to use this feature until a fix is in place. For a detailed description of how this should be calculated, see [Saeb et al](https://pubmed.ncbi.nlm.nih.gov/28344895/).
**Fine-Tuning Clustering Parameters**
Based on an experiment where we collected fused location data for 7 days with a mean accuracy of 86 and an SD of 350.874635, we determined that `EPS/MAX_EPS`=100 produced clustering results closer to reality. Higher values (>100) missed out on some significant places, like a short grocery visit, while lower values (<100) picked up traffic lights and stop signs while driving as significant locations. We recommend you set `EPS` based on your location data's accuracy (the more accurate your data is, the lower you should be able to set EPS).
**Duration Calculation**
To calculate the time duration component for our features, we compute the difference between consecutive rows' timestamps to take into account sampling rate variability. If this time difference is larger than a threshold (300 seconds by default), we replace it with a maximum duration (60 seconds by default, i.e., we assume a participant spent at least 60 seconds in their last known location).
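A minimal sketch of this duration rule, assuming illustrative column names and the defaults above (RAPIDS' actual implementation lives in the Doryab provider code):

```r
library(dplyr)
library(tibble)

maximum_row_gap <- 300      # seconds, threshold described above
maximum_row_duration <- 60  # seconds, replacement duration

rows <- tibble(timestamp = c(0, 60, 120, 1000) * 1000) # milliseconds

# The duration of each row is the gap to the next row, unless the gap is so
# large that the phone probably stopped sensing, in which case we assume the
# participant stayed at least maximum_row_duration seconds
rows %>%
    mutate(gap_seconds = (lead(timestamp) - timestamp) / 1000,
           duration = if_else(gap_seconds > maximum_row_gap,
                              maximum_row_duration, gap_seconds))
```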
**Home location**
Home is calculated by taking all location data of a participant between 12 am and 6 am, applying a clustering algorithm (`DB_SCAN` or `OPTICS`), and considering the center of the biggest cluster as the home coordinates for that participant. A sketch follows below.
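A rough sketch of that computation, where the `cluster` column stands in for the DBSCAN/OPTICS labels RAPIDS computes and averaging raw lat/long coordinates is a simplification:

```r
library(dplyr)
library(tibble)
library(lubridate)

location_data <- tibble(
    local_date_time = c("2021-03-01 01:10:00", "2021-03-01 02:30:00", "2021-03-01 14:00:00"),
    double_latitude = c(40.4406, 40.4407, 40.4500),
    double_longitude = c(-79.9959, -79.9960, -79.9000),
    cluster = c(1, 1, 2) # placeholder for the DBSCAN/OPTICS cluster label
)

home <- location_data %>%
    filter(hour(ymd_hms(local_date_time)) < 6) %>% # between 12 am and 6 am
    group_by(cluster) %>%
    summarise(n = n(),
              home_latitude = mean(double_latitude),
              home_longitude = mean(double_longitude)) %>%
    slice_max(n, n = 1) # the biggest cluster's center is home
```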

View File

@@ -21,7 +21,7 @@ When you are done with this configuration, go to [executing RAPIDS](../execution

A data stream refers to sensor data collected using a specific type of **device** with a specific **format** and stored in a specific **container**. For example, the `aware_mysql` data stream handles smartphone data (**device**) collected with the [AWARE Framework](https://awareframework.com/) (**format**) stored in a MySQL database (**container**).

Check the table in [introduction to data streams](../../datastreams/data-streams-introduction) to know what data streams we support. If your data stream is supported, continue to the next configuration section; **you will use its label later in this guide** (e.g. `aware_mysql`). If your stream is not supported but you want to implement it, follow this tutorial to [add support for new data streams](../../datastreams/add-new-data-streams) and get in touch by email or in Slack if you have any questions.

---
@@ -350,7 +350,7 @@ TIMEZONE:
    MULTIPLE:
        TZCODES_FILE: path_to/time_zones_csv.file
        IF_MISSING_TZCODE: STOP
        DEFAULT_TZCODE: America/New_York
        FITBIT:
            ALLOW_MULTIPLE_TZ_PER_DEVICE: False
            INFER_FROM_SMARTPHONE_TZ: False
@@ -363,7 +363,7 @@ Parameters for `[TIMEZONE]`

|`[TYPE]`| Either `SINGLE` or `MULTIPLE` as explained above |
|`[SINGLE][TZCODE]`| The time zone code from this [list](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones) to be used across all devices |
|`[MULTIPLE][TZCODES_FILE]`| A CSV file containing the timestamps and time zone codes from this [list](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones) visited by each device in the study. Multiple devices can be linked to the same person, read more in [Participants Files](#participant-files) |
|`[MULTIPLE][IF_MISSING_TZCODE]`| When a device is missing from `[TZCODES_FILE]`, set this flag to `STOP` to stop RAPIDS execution and show an error, or to `USE_DEFAULT` to assign the time zone specified in `[DEFAULT_TZCODE]` to any such devices |
|`[MULTIPLE][FITBIT][ALLOW_MULTIPLE_TZ_PER_DEVICE]`| You only need to care about this flag if one or more Fitbit devices sensed data in one or more time zones, and you want RAPIDS to take this into account in its feature computation. Read more in "How does RAPIDS handle Fitbit devices?" below. |
|`[MULTIPLE][FITBIT][INFER_FROM_SMARTPHONE_TZ]`| You only need to care about this flag if one or more Fitbit devices sensed data in one or more time zones, and you want RAPIDS to take this into account in its feature computation. Read more in "How does RAPIDS handle Fitbit devices?" below. |

@@ -375,11 +375,11 @@ time zone, and you want RAPIDS to take into account this in its feature computat
|Column | Description |
|--|--|
|`device_id`|A string that uniquely identifies a smartphone or wearable|
|`tzcode`| A string with the appropriate code from this [list](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones) that represents the time zone where the `device` sensed data|
|`timestamp`| A UNIX timestamp indicating the first time this `device_id` sensed data in `tzcode`|

```csv
device_id, tzcode, timestamp
13dbc8a3-dae3-4834-823a-4bc96a7d459d, America/New_York, 1587500000000
13dbc8a3-dae3-4834-823a-4bc96a7d459d, America/Mexico_City, 1587600000000
13dbc8a3-dae3-4834-823a-4bc96a7d459d, America/Los_Angeles, 1587700000000
```
@@ -402,7 +402,7 @@ time zone, and you want RAPIDS to take into account this in its feature computat

??? note "What happens if participant X lives in Los Angeles but participant Y lives in Amsterdam and they both stayed there during my study?"
Add a row per participant and set timestamp to `0`:
```csv
device_id, tzcode, timestamp
13dbc8a3-dae3-4834-823a-4bc96a7d459d, America/Los_Angeles, 0
65sa66a5-2d2d-4524-946v-44ascbv4sad7, Europe/Amsterdam, 0
```
@@ -412,14 +412,14 @@ time zone, and you want RAPIDS to take into account this in its feature computat
If `[IF_MISSING_TZCODE]` is set to `STOP`, RAPIDS will stop its execution and show you an error message.

If `[IF_MISSING_TZCODE]` is set to `USE_DEFAULT`, it will assign the time zone specified in `[DEFAULT_TZCODE]` to any devices with missing time zone information in `[TZCODES_FILE]`. This is helpful if only a few of your participants had multiple time zones and you don't want to specify the same time zone for the rest.

??? note "How does RAPIDS handle Fitbit devices?"
Fitbit devices are not time zone aware and they always log data with a local date-time string.

- When none of the Fitbit devices in your study changed time zones (e.g., `p01` was always in New York and `p02` was always in Amsterdam), you can set a single time zone per Fitbit device id along with a timestamp of 0 (you can still assign multiple time zones to smartphone device ids)
```csv
device_id, tzcode, timestamp
fitbit123, America/New_York, 0
fitbit999, Europe/Amsterdam, 0
```

View File

@@ -67,3 +67,13 @@ def download_phone_data_input_with_mutation_scripts(wilcards):
            raise ValueError("Mutate scripts can only be Python or R scripts (.py, .R).\n Instead we got {script} in \n [{sensor}][{device_os}] of {schema}".format(script=script, sensor=sensor, device_os=device_os, schema=input.get("source_schema_file")))
        input["mutationscript"+str(idx)] = script
    return input
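
# Return [TIMEZONE][MULTIPLE][TZCODES_FILE] as an extra rule input when multiple
# time zones are configured, after validating that it points to an existing CSV file.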
def input_tzcodes_file(wildcards):
    from pathlib import Path
    if config["TIMEZONE"]["TYPE"] == "MULTIPLE":
        if not config["TIMEZONE"]["MULTIPLE"]["TZCODES_FILE"].lower().endswith(".csv"):
            raise ValueError("[TIMEZONE][MULTIPLE][TZCODES_FILE] should point to a CSV file, instead you typed: " + config["TIMEZONE"]["MULTIPLE"]["TZCODES_FILE"])
        if not Path(config["TIMEZONE"]["MULTIPLE"]["TZCODES_FILE"]).exists():
            raise ValueError("[TIMEZONE][MULTIPLE][TZCODES_FILE] should point to a CSV file, the file in the path you typed does not exist: " + config["TIMEZONE"]["MULTIPLE"]["TZCODES_FILE"])
        return [config["TIMEZONE"]["MULTIPLE"]["TZCODES_FILE"]]
    return []

View File

@@ -63,16 +63,19 @@ rule compute_time_segments:
rule phone_readable_datetime:
    input:
        sensor_input = "data/raw/{pid}/phone_{sensor}_raw.csv",
        time_segments = "data/interim/time_segments/{pid}_time_segments.csv",
        pid_file = "data/external/participant_files/{pid}.yaml",
        tzcodes_file = input_tzcodes_file,
    params:
        device_type = "phone",
        timezone_parameters = config["TIMEZONE"],
        pid = "{pid}",
        time_segments_type = config["TIME_SEGMENTS"]["TYPE"],
        include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
    output:
        "data/raw/{pid}/phone_{sensor}_with_datetime.csv"
    script:
        "../src/data/datetime/readable_datetime.R"
rule phone_yielded_timestamps:
    input:

@@ -87,16 +90,19 @@ rule phone_yielded_timestamps:
rule phone_yielded_timestamps_with_datetime:
    input:
        sensor_input = "data/interim/{pid}/phone_yielded_timestamps.csv",
        time_segments = "data/interim/time_segments/{pid}_time_segments.csv",
        pid_file = "data/external/participant_files/{pid}.yaml",
        tzcodes_file = input_tzcodes_file,
    params:
        device_type = "phone",
        timezone_parameters = config["TIMEZONE"],
        pid = "{pid}",
        time_segments_type = config["TIME_SEGMENTS"]["TYPE"],
        include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
    output:
        "data/interim/{pid}/phone_yielded_timestamps_with_datetime.csv"
    script:
        "../src/data/datetime/readable_datetime.R"
rule unify_ios_android:
    input:

@@ -125,16 +131,19 @@ rule process_phone_locations_types:
rule phone_locations_processed_with_datetime:
    input:
        sensor_input = "data/interim/{pid}/phone_locations_processed.csv",
        time_segments = "data/interim/time_segments/{pid}_time_segments.csv",
        pid_file = "data/external/participant_files/{pid}.yaml",
        tzcodes_file = input_tzcodes_file,
    params:
        device_type = "phone",
        timezone_parameters = config["TIMEZONE"],
        pid = "{pid}",
        time_segments_type = config["TIME_SEGMENTS"]["TYPE"],
        include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
    output:
        "data/interim/{pid}/phone_locations_processed_with_datetime.csv"
    script:
        "../src/data/datetime/readable_datetime.R"
rule phone_locations_processed_with_datetime_with_home:
    input:

@@ -160,16 +169,20 @@ rule resample_episodes:
rule resample_episodes_with_datetime:
    input:
        sensor_input = "data/interim/{pid}/{sensor}_episodes_resampled.csv",
        time_segments = "data/interim/time_segments/{pid}_time_segments.csv",
        pid_file = "data/external/participant_files/{pid}.yaml",
        tzcodes_file = input_tzcodes_file,
    params:
        device_type = lambda wildcards: wildcards.sensor.split("_")[0],
        timezone_parameters = config["TIMEZONE"],
        pid = "{pid}",
        time_segments_type = config["TIME_SEGMENTS"]["TYPE"],
        include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
    output:
        "data/interim/{pid}/{sensor}_episodes_resampled_with_datetime.csv"
    script:
        "../src/data/datetime/readable_datetime.R"
rule phone_application_categories:
    input:

View File

@@ -0,0 +1,103 @@
library(tibble)
library(dplyr)
library(tidyr)
library(purrr)
library(yaml)
options(scipen = 999)
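
# Turn each device's chronologically ordered time zone rows into
# [timestamp, end_timestamp) intervals; the last known time zone
# remains valid until the current time.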
build_tz_intervals <- function(tz_codes){
    tz_codes <- tz_codes %>%
        group_by(device_id) %>%
        mutate(end_timestamp = lead(timestamp)) %>%
        ungroup() %>%
        replace_na(list(end_timestamp = as.numeric(Sys.time())*1000))
    return(tz_codes)
}
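
# Return the time zone intervals that belong to one device; depending on
# IF_MISSING_TZCODE, devices absent from TZCODES_FILE either stop the pipeline
# or fall back to the default time zone.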
filter_tz_per_device <- function(device_id, tz_codes, default, IF_MISSING_TZCODE){
    device_tz_codes <- tz_codes %>% filter(device_id == !!device_id) %>% select(-device_id)
    if(nrow(device_tz_codes) > 0)
        return(device_tz_codes)
    else if(IF_MISSING_TZCODE == "STOP")
        stop(paste("The device id '", device_id, "' does not have any time zone codes in your [MULTIPLE][TZCODES_FILE], add one or set IF_MISSING_TZCODE to 'USE_DEFAULT'"))
    else if(IF_MISSING_TZCODE == "USE_DEFAULT")
        return(data.frame(timestamp = c(0), tzcode = default, end_timestamp = as.numeric(Sys.time())*1000))
    stop("We should have obtained the time zones for a device, stop the execution or use the default tz but this didn't happen. Create an issue on Github")
}
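
# Label each sensed row with the time zone interval its timestamp falls into;
# rows that fall outside every interval are dropped.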
assign_tz_code <- function(data, tz_codes){
    data$local_timezone = NA_character_
    for(i in 1:nrow(tz_codes)) {
        start_timestamp <- tz_codes[[i, "timestamp"]]
        end_timestamp <- tz_codes[[i, "end_timestamp"]]
        time_zone <- trimws(tz_codes[[i, "tzcode"]], which="both")
        data$local_timezone <- ifelse(start_timestamp <= data$timestamp & data$timestamp < end_timestamp, time_zone, data$local_timezone)
    }
    return(data %>% filter(!is.na(local_timezone)))
}
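
# When ALLOW_MULTIPLE_TZ_PER_DEVICE is False, every Fitbit device must have a
# single time zone row, and that row's timestamp must be 0.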
validate_single_tz_per_fitbit_device <- function(tz_codes, INFER_FROM_SMARTPHONE_TZ){
    if(INFER_FROM_SMARTPHONE_TZ)
        stop("If [TIMEZONE][MULTIPLE][FITBIT][INFER_FROM_SMARTPHONE_TZ] is True (you want to infer Fitbit time zones with smartphone data), you need to set ALLOW_MULTIPLE_TZ_PER_DEVICE to True. However, read the docs to understand why this can be inaccurate")

    tz_per_device <- tz_codes %>% group_by(device_id) %>% summarise(n = n(), .groups = "drop_last") %>% filter(n > 1)
    if(nrow(tz_per_device) > 0)
        stop(paste("The following Fitbit device ids have more than one time zone change, which is not allowed if [TIMEZONE][MULTIPLE][FITBIT][ALLOW_MULTIPLE_TZ_PER_DEVICE] is False:", paste(tz_per_device %>% pull(device_id), collapse = ",")))

    zero_ts <- tz_codes %>% filter(timestamp > 0)
    if(nrow(zero_ts) > 0)
        stop(paste("The following Fitbit device ids have a time zone change with a timestamp bigger than 0, which is not allowed if [TIMEZONE][MULTIPLE][FITBIT][ALLOW_MULTIPLE_TZ_PER_DEVICE] is False: ", paste(zero_ts %>% pull(device_id), collapse = ",")))
}
validate_devices_exist_in_participant_file <- function(devices, device_type, pid, participant_file){
    if(length(devices) == 0)
        stop("[TIMEZONE][MULTIPLE][FITBIT][INFER_FROM_SMARTPHONE_TZ] is True (you want to infer Fitbit time zones with smartphone data), however participant ", pid," does not have any [",device_type,"][DEVICE_IDS] in ", participant_file)
}
# TODO include CSV timezone file in rule
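# Assign a time zone to every row of a device's data using the intervals built
# from TZCODES_FILE. Empatica data is identified by pid, and Fitbit devices can
# share their paired smartphone's time zones when INFER_FROM_SMARTPHONE_TZ is True.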
multiple_time_zone_assignment <- function(data, timezone_parameters, device_type, pid, participant_file){
    tz_codes <- read.csv(timezone_parameters$MULTIPLE$TZCODES_FILE)
    default <- timezone_parameters$MULTIPLE$DEFAULT_TZCODE
    IF_MISSING_TZCODE <- timezone_parameters$MULTIPLE$IF_MISSING_TZCODE
    ALLOW_MULTIPLE_TZ_PER_DEVICE <- timezone_parameters$MULTIPLE$FITBIT$ALLOW_MULTIPLE_TZ_PER_DEVICE
    INFER_FROM_SMARTPHONE_TZ <- timezone_parameters$MULTIPLE$FITBIT$INFER_FROM_SMARTPHONE_TZ

    participant_data <- read_yaml(participant_file)
    phone_ids <- participant_data$PHONE$DEVICE_IDS
    fitbit_ids <- participant_data$FITBIT$DEVICE_IDS

    if(device_type == "empatica")
        data$device_id = pid
    else if(device_type == "fitbit"){
        if(!ALLOW_MULTIPLE_TZ_PER_DEVICE){
            validate_single_tz_per_fitbit_device(tz_codes, INFER_FROM_SMARTPHONE_TZ)
        } else if(INFER_FROM_SMARTPHONE_TZ){
            validate_devices_exist_in_participant_file(phone_ids, "PHONE", pid, participant_file)
            validate_devices_exist_in_participant_file(fitbit_ids, "FITBIT", pid, participant_file)
            unified_device_id <- paste0("unified_device_id", pid)
            data <- data %>% mutate(device_id = if_else(device_id %in% phone_ids, unified_device_id, device_id))
            tz_codes <- tz_codes %>% mutate(device_id = if_else(device_id %in% fitbit_ids, unified_device_id, device_id))
        }
    }

    tz_intervals <- build_tz_intervals(tz_codes)
    data <- data %>%
        group_by(device_id) %>%
        nest() %>%
        mutate(tz_codes_per_device = map(device_id, filter_tz_per_device, tz_intervals, default, IF_MISSING_TZCODE)) %>%
        mutate(data = map2(data, tz_codes_per_device, assign_tz_code)) %>%
        select(-tz_codes_per_device) %>%
        unnest(cols = data)

    return(data)
}

View File

@@ -2,7 +2,7 @@ library("tidyverse")
library("lubridate", warn.conflicts = F)
options(scipen=999)

day_type_delay <- function(time_segments, day_type, include_past_periodic_segments){
    delay <- time_segments %>% mutate(length_duration = duration(length)) %>% filter(repeats_on == day_type) %>% arrange(-length_duration) %>% pull(length_duration) %>% first()
    return(if_else(is.na(delay) | include_past_periodic_segments == FALSE, duration("0days"), delay))
}
@@ -90,10 +90,10 @@ assign_to_time_segment <- function(sensor_data, time_segments, time_segments_typ

    # We need to take into account segment start dates that could include the first day of data
    time_segments <- time_segments %>% mutate(length_duration = duration(length))
    every_day_delay <- duration("0days")
    wday_delay <- day_type_delay(time_segments, "wday", include_past_periodic_segments)
    mday_delay <- day_type_delay(time_segments, "mday", include_past_periodic_segments)
    qday_delay <- day_type_delay(time_segments, "qday", include_past_periodic_segments)
    yday_delay <- day_type_delay(time_segments, "yday", include_past_periodic_segments)

    sensor_data <- sensor_data %>%
        group_by(local_timezone) %>%

View File

@@ -0,0 +1,119 @@
source("renv/activate.R")
library("tidyverse")
library("readr")
library("tidyr")
library("lubridate")
library("yaml")
source("src/data/datetime/assign_to_time_segment.R")
source("src/data/datetime/assign_to_multiple_timezones.R")
split_local_date_time <- function(data){
    data <- data %>%
        separate(local_date_time, c("local_date","local_time"), "\\s", remove = FALSE) %>%
        separate(local_time, c("local_hour", "local_minute"), ":", remove = FALSE, extra = "drop") %>%
        mutate(local_hour = as.numeric(local_hour),
               local_minute = as.numeric(local_minute))
    return(data)
}
is_valid_timezone <- function(timezone) {
    return(timezone %in% (OlsonNames()))
}
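
# Fail early with a clear error if the TIMEZONE configuration is invalid or if
# TZCODES_FILE has the wrong columns or unknown time zone codes.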
validate_user_timezones <- function(timezone_parameters){
if(!timezone_parameters$TYPE %in% c("SINGLE", "MULTIPLE"))
stop("Invalid [TIMEZONE][TYPE], only valid options are SINGLE or MULTIPLE")
if(timezone_parameters$TYPE == "SINGLE"){
if(!is_valid_timezone(timezone_parameters$SINGLE$TZCODE))
stop(paste("[TIMEZONE][SINGLE][TZCODE] is not a valid timezone: ", timezone_parameters$SINGLE$TZCODE))
} else if(timezone_parameters$TYPE == "MULTIPLE"){
tz_codes <- read.csv(timezone_parameters$MULTIPLE$TZCODES_FILE)
valid_file_columns <- c("device_id", "timestamp", "tzcode")
if(length(colnames(tz_codes)) != length(valid_file_columns) || !setequal(colnames(tz_codes), valid_file_columns))
stop(paste("[TIMEZONE][MULTIPLE][TZCODES_FILE] has does not have the required columns. You provided",paste(colnames(tz_codes), collapse=","),"but we need",paste(valid_file_columns, collapse=",")))
invalid_tz_codes <- tz_codes %>%
mutate(row = (1:n()) + 1,
tzcode = trimws(tzcode, which="both"),
is_valid = is_valid_timezone(tzcode)) %>%
filter(is_valid == FALSE)
if(nrow(invalid_tz_codes) > 0)
stop(paste("[TIMEZONE][MULTIPLE][TZCODES_FILE] has invalid time zone codes. In file ", timezone_parameters$MULTIPLE$TZCODES_FILE, ".\nAffected rows=[", paste(invalid_tz_codes %>% pull(row),collapse=","), "], with invalid codes=[", paste(invalid_tz_codes %>% pull(tzcode),collapse=",") ,"]"))
}
}
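Based on the columns this validation expects, a minimal `TZCODES_FILE` could be generated as sketched below (not part of the commit). The device ids and timestamps are made up, and each row is assumed to mark the moment, in Unix milliseconds, from which a device should be interpreted in that zone:

```r
library("dplyr", warn.conflicts = F)
library("readr")

tz_codes <- tibble(
  device_id = c("a748ee1a-1d0b-4ae9", "a748ee1a-1d0b-4ae9", "13dbc8a3-dae3-4834"),
  timestamp = c(0, 1611673600000, 0),  # made-up zone-switch times
  tzcode    = c("America/New_York", "America/Mexico_City", "Europe/Madrid")
)
write_csv(tz_codes, "data/external/multiple_timezones_example.csv")
```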
create_missing_temporal_column <- function(data, device_type){
  if(device_type == "fitbit"){
    # For Fitbit we infer the timestamp from Fitbit's local date time
    return(data %>%
      group_by(local_timezone) %>%
      nest() %>%
      mutate(data = map2(data, local_timezone, function(nested_data, tz){
        return(nested_data %>% mutate(timestamp = as.numeric(ymd_hms(local_date_time, tz=tz)) * 1000) %>% drop_na(timestamp))
      })) %>%
      unnest(cols = everything()))
  } else {
    # For every other device we infer the local date time from the timestamp
    return(data %>%
      group_by(local_timezone) %>%
      nest() %>%
      mutate(data = map2(data, local_timezone, function(nested_data, tz){
        return(nested_data %>% mutate(local_date_time = format(as_datetime(timestamp / 1000, tz=tz), format="%Y-%m-%d %H:%M:%S")) %>% drop_na(local_date_time))
      })) %>%
      unnest(cols = everything()))
  }
}
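To make the non-Fitbit branch concrete, a toy run (an editor's sketch with made-up values, not part of the commit): the function fills in `local_date_time` from the epoch `timestamp`, one time zone group at a time:

```r
library("tidyverse")
library("lubridate")

data <- tibble(
  timestamp      = c(1614985777000, 1614985787000),  # made-up epoch milliseconds
  local_timezone = c("America/New_York", "America/New_York")
)
create_missing_temporal_column(data, "phone")
# Each row gains a local_date_time string rendered in that row's time zone;
# rows whose timestamp cannot be converted are dropped
```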
filter_wanted_dates <- function(output, participant_file, device_type){
  participant_data <- read_yaml(participant_file)
  device_type <- toupper(device_type)
  start_date <- participant_data[[device_type]]$START_DATE
  end_date <- participant_data[[device_type]]$END_DATE

  if(!is.null(start_date)){
    start_date <- parse_date_time(start_date, orders = c("ymd", "ymdhMs", "ymdhM", "ymdh"))
    if(is.na(start_date))
      stop(paste0("[",device_type, "][START_DATE] does not have one of these valid formats: [ymd, ymd hms, ymd hm, ymd h], you typed: '", participant_data[[device_type]]$START_DATE, "' in ", participant_file))
    output <- output %>% filter(ymd_hms(local_date_time) >= start_date)
  }

  if(!is.null(end_date)){
    end_date <- parse_date_time(end_date, orders = c("ymd", "ymdhMs", "ymdhM", "ymdh"))
    if(is.na(end_date))
      stop(paste0("[",device_type, "][END_DATE] does not have one of these valid formats: [ymd, ymd hms, ymd hm, ymd h], you typed: '", participant_data[[device_type]]$END_DATE, "' in ", participant_file))
    output <- output %>% filter(ymd_hms(local_date_time) <= end_date)
  }
  return(output)
}
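For reference (a sketch, not part of the commit), these are the kinds of `START_DATE`/`END_DATE` strings that `orders = c("ymd", "ymdhMs", "ymdhM", "ymdh")` accepts or rejects:

```r
library("lubridate")

orders <- c("ymd", "ymdhMs", "ymdhM", "ymdh")
parse_date_time("2021-03-05", orders = orders)           # ok: ymd
parse_date_time("2021-03-05 17:49:37", orders = orders)  # ok: ymd hms
parse_date_time("2021-03-05 17:49", orders = orders)     # ok: ymd hm
parse_date_time("05-03-2021", orders = orders)           # NA: dmy is not accepted, so the stop() above fires
```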
readable_datetime <- function(){
  input <- read.csv(snakemake@input[["sensor_input"]]) %>% arrange(timestamp)
  time_segments <- read.csv(snakemake@input[["time_segments"]])
  participant_file <- snakemake@input[["pid_file"]]
  device_type <- snakemake@params[["device_type"]]
  timezone_parameters <- snakemake@params[["timezone_parameters"]]
  pid <- snakemake@params[["pid"]]
  time_segments_type <- snakemake@params[["time_segments_type"]]
  include_past_periodic_segments <- snakemake@params[["include_past_periodic_segments"]]

  validate_user_timezones(timezone_parameters)
  if(timezone_parameters$TYPE == "SINGLE"){
    output <- input %>% mutate(local_timezone = timezone_parameters$SINGLE$TZCODE)
  } else if(timezone_parameters$TYPE == "MULTIPLE"){
    output <- multiple_time_zone_assignment(input, timezone_parameters, device_type, pid, participant_file)
  }

  output <- create_missing_temporal_column(output, device_type)
  output <- split_local_date_time(output)
  output <- assign_to_time_segment(output, time_segments, time_segments_type, include_past_periodic_segments)
  output <- filter_wanted_dates(output, participant_file, device_type)
  write_csv(output, snakemake@output[[1]])
}

readable_datetime()


@@ -3,11 +3,12 @@ library("dplyr", warn.conflicts = F)
 library(readr)
 library(tidyr)
 library(purrr)
+options(scipen=999)
 all_sensors = snakemake@input[["all_sensors"]]
 sensor_timestamps <- tibble(files = all_sensors) %>%
-  mutate(timestamps = map(files,~ read_csv(.,col_types = cols_only(timestamp = col_double()))),
+  mutate(timestamps = map(files,~ read_csv(.,col_types = cols_only(timestamp = col_double(), device_id = col_character()))),
          sensor = row_number(),
          files = NULL) %>%
   unnest(timestamps) %>%


@@ -51,7 +51,7 @@ if(locations_to_use == "ALL"){
   # Filter those rows that are further away than time_since_valid_location since the last fused location
   mutate(time_from_fused = timestamp - first(timestamp)) %>%
   filter(provider %in% providers_to_keep | (time_from_fused < (1000 * 60 * time_since_valid_location))) %>%
-  select(-consecutive_time_diff, -time_from_fused, -device_id) %>%
+  select(-consecutive_time_diff, -time_from_fused) %>%
   # Summarise the period to resample for
   summarise(across(timestamp, max, .names = "limit"), across(everything(), first)) %>%
   # the limit will be equal to the next timestamp-1 or the last binded timestamp (limit) plus the consecutive_threshold buffer


@@ -1,49 +0,0 @@
source("renv/activate.R")
library("tidyverse")
library("readr")
source("src/data/assign_to_time_segment.R")

input <- read.csv(snakemake@input[["sensor_input"]]) %>% arrange(timestamp)
time_segments <- read.csv(snakemake@input[["time_segments"]])
time_segments_type <- snakemake@params[["time_segments_type"]]
sensor_output <- snakemake@output[[1]]
timezone_periods <- snakemake@params[["timezone_periods"]]
fixed_timezone <- snakemake@params[["fixed_timezone"]]
include_past_periodic_segments <- snakemake@params[["include_past_periodic_segments"]]

split_local_date_time <- function(data, time_segments){
  split_data <- data %>%
    separate(local_date_time, c("local_date","local_time"), "\\s", remove = FALSE) %>%
    separate(local_time, c("local_hour", "local_minute"), ":", remove = FALSE, extra = "drop") %>%
    mutate(local_hour = as.numeric(local_hour),
           local_minute = as.numeric(local_minute))
  return(split_data)
}

if(!is.null(timezone_periods)){
  # TODO: Not active yet
  # timezones <- read_csv(timezone_periods)
  # tz_starts <- timezones$start
  # output <- input %>%
  #   mutate(timezone = findInterval(timestamp / 1000, tz_starts), # Set an interval ID based on timezones' start column
  #          timezone = ifelse(timezone == 0, 1, timezone), # Correct the first timezone ID
  #          timezone = recode(timezone, !!! timezones$timezone), # Swap IDs for text labels
  #          timezone = as.character(timezone)) %>%
  #   rowwise() %>%
  #   mutate(utc_date_time = as.POSIXct(timestamp/1000, origin="1970-01-01", tz="UTC"),
  #          local_date_time = format(utc_date_time, tz = timezone, usetz = T, "%Y-%m-%d %H:%M:%S"))
  # output <- split_local_date_time(output, time_segments)
  # TODO: Implement time segment assignment with support for multiple timezones
  # output <- assign_to_time_segment(output, time_segments, time_segments_type, fixed_timezone)
  # write.csv(output, sensor_output)
} else if(!is.null(fixed_timezone)){
  output <- input %>%
    mutate(utc_date_time = as.POSIXct(timestamp/1000, origin="1970-01-01", tz="UTC"),
           local_timezone = fixed_timezone,
           local_date_time = format(utc_date_time, tz = fixed_timezone, "%Y-%m-%d %H:%M:%S"))
  output <- split_local_date_time(output, time_segments)
  output <- assign_to_time_segment(output, time_segments, time_segments_type, include_past_periodic_segments)
  write_csv(output, sensor_output)
}


@@ -16,10 +16,11 @@ if(nrow(activity_recognition) > 0){
     type_diff = c(1, diff(activity_type)),
     episode_id = cumsum(type_diff != 0 | time_diff > (episode_threshold_between_rows))) %>%
   group_by(episode_id) %>%
-  summarise(activity_name = first(activity_name), activity_type = first(activity_type), start_timestamp=first(start_timestamp), end_timestamp = last(end_timestamp))
+  summarise(device_id = first(device_id), activity_name = first(activity_name), activity_type = first(activity_type), start_timestamp=first(start_timestamp), end_timestamp = last(end_timestamp))
 } else {
-  ar_episodes <- data.frame(start_timestamp = numeric(),
+  ar_episodes <- data.frame(device_id = character(),
+                            start_timestamp = numeric(),
                             end_timestamp = numeric(),
                             episode_id = numeric(),
                             activity_type = numeric(),


@@ -19,9 +19,10 @@ if(nrow(battery) > 0){
     status_diff = c(1, diff(battery_status)),
     episode_id = cumsum(level_diff != 0 | status_diff != 0 | time_diff > (episode_threshold_between_rows))) %>%
   group_by(episode_id) %>%
-  summarise(battery_level = first(battery_level), battery_status = first(battery_status), start_timestamp=first(start_timestamp), end_timestamp = last(end_timestamp))
+  summarise(device_id = first(device_id), battery_level = first(battery_level), battery_status = first(battery_status), start_timestamp=first(start_timestamp), end_timestamp = last(end_timestamp))
 } else {
-  battery_episodes <- data.frame(episode_id = numeric(),
+  battery_episodes <- data.frame(device_id = character(),
+                                 episode_id = numeric(),
                                  start_timestamp = numeric(),
                                  end_timestamp = character(),
                                  battery_level = character(),


@@ -34,7 +34,6 @@ barnett_features <- function(sensor_data_files, time_segment, params){
   location <- location_data
   accuracy_limit <- params[["ACCURACY_LIMIT"]]
-  timezone <- params[["TIMEZONE"]]
   minutes_data_used <- params[["MINUTES_DATA_USED"]]
   # Compute what features were requested
@@ -74,8 +73,10 @@ barnett_features <- function(sensor_data_files, time_segment, params){
   location_dates_segments <- location %>% select(local_date, local_segment) %>% distinct(local_date, .keep_all = TRUE)
   # Select only the columns that the algorithm needs
+  all_timezones <- table(location %>% pull(local_timezone))
   location <- location %>% select(timestamp, latitude = double_latitude, longitude = double_longitude, altitude = double_altitude, accuracy)
   if(nrow(location %>% filter(accuracy < accuracy_limit)) > 1){
+    timezone <- names(all_timezones)[as.vector(all_timezones)==max(all_timezones)]
     outputMobility <- MobilityFeatures(location, ACCURACY_LIM = accuracy_limit, tz = timezone)
   } else {
     print(paste("Cannot compute Barnett location features because there are no rows with an accuracy value lower than ACCURACY_LIMIT", accuracy_limit))


@@ -43,6 +43,7 @@ get_screen_episodes <- function(screen){
   # Only keep consecutive 3,0 pairs (UNLOCK, OFF)
   filter( (screen_status == 3 & lead(screen_status) == 0) | (screen_status == 0 & lag(screen_status) == 3) ) %>%
   summarise(episode = "unlock",
+            device_id = first(device_id),
             screen_sequence = toString(screen_status),
             start_timestamp = first(timestamp),
             end_timestamp = last(timestamp)) %>%
@@ -55,7 +56,8 @@ }
 }
 if(nrow(screen) < 2){
-  episodes <- data.frame(episode = character(),
+  episodes <- data.frame(device_id = character(),
+                         episode = character(),
                          screen_sequence = character(),
                          start_timestamp = character(),
                          end_timestamp = character())


@@ -15,7 +15,7 @@ filter_data_by_segment <- function(data, time_segment){
 }
 chunk_episodes <- function(sensor_episodes){
-  columns_to_drop <- c("^timestamp$", "utc_date_time", "local_date_time", "local_date", "local_time", "local_hour", "local_minute", "segment_start", "segment_end" )
+  columns_to_drop <- c("^timestamp$", "local_date_time", "local_date", "local_time", "local_hour", "local_minute", "segment_start", "segment_end" )
   chunked_episodes <- sensor_episodes %>%
     separate(col = timestamps_segment,


@@ -38,7 +38,7 @@ def chunk_episodes(sensor_episodes):
     sensor_episodes = sensor_episodes.drop_duplicates(subset=["start_timestamp", "end_timestamp", "local_segment"], keep="first")
     # Delete useless columns
-    for drop_col in ["utc_date_time", "local_date_time", "local_date", "local_time", "local_hour", "local_minute"]:
+    for drop_col in ["local_date_time", "local_date", "local_time", "local_hour", "local_minute"]:
         del sensor_episodes[drop_col]
     # Avoid SettingWithCopyWarning


@@ -117,7 +117,38 @@ properties:
     type: string
   TIMEZONE:
-    type: string
+    type: object
+    required: [TYPE, SINGLE, MULTIPLE]
+    properties:
+      TYPE:
+        type: string
+        enum: ["MULTIPLE", "SINGLE"]
+      SINGLE:
+        type: object
+        required: [TZCODE]
+        properties:
+          TZCODE:
+            type: string
+      MULTIPLE:
+        type: object
+        required: [TZCODES_FILE, IF_MISSING_TZCODE, DEFAULT_TZCODE, FITBIT]
+        properties:
+          TZCODES_FILE:
+            type: string
+            pattern: "^.*\\.csv$"
+          IF_MISSING_TZCODE:
+            type: string
+            enum: [USE_DEFAULT]
+          DEFAULT_TZCODE:
+            type: string
+          FITBIT:
+            type: object
+            required: [ALLOW_MULTIPLE_TZ_PER_DEVICE, INFER_FROM_SMARTPHONE_TZ]
+            properties:
+              ALLOW_MULTIPLE_TZ_PER_DEVICE:
+                type: boolean
+              INFER_FROM_SMARTPHONE_TZ:
+                type: boolean
   PIDS:
     type: array
@@ -666,8 +697,9 @@ properties:
       ACCURACY_LIMIT:
         type: integer
         exclusiveMinimum: 0
-      TIMEZONE:
+      IF_MULTIPLE_TIMEZONES:
         type: string
+        enum: [USE_MOST_COMMON]
       MINUTES_DATA_USED:
         type: boolean
     additionalProperties: