From 0d6f51be8b06a1e5e4d9d8296e3fc5c1f49fdd82 Mon Sep 17 00:00:00 2001 From: Meng Li <34143965+Meng6@users.noreply.github.com> Date: Fri, 9 Apr 2021 12:05:25 -0400 Subject: [PATCH] Refactor location features from Doryab provider & add a new strategy to infer home location & fix bugs --- Snakefile | 3 +- config.yaml | 18 +- docs/change-log.md | 2 + docs/citation.md | 5 +- docs/developers/test-cases.md | 9 + docs/features/phone-locations.md | 44 +- example_profile/example_config.yaml | 16 +- rules/common.smk | 6 + rules/features.smk | 20 +- rules/preprocessing.smk | 13 - src/data/infer_home_location.py | 140 ----- .../doryab/add_doryab_extra_columns.py | 138 +++++ .../doryab/doryab_clustering.py | 77 +++ src/features/phone_locations/doryab/main.py | 519 +++++------------- .../aware_csv/phone_locations_raw.csv | 53 ++ .../manual/aware_csv/phone_locations_raw.csv | 29 + .../mtz_event/android/phone_locations.csv | 3 + .../mtz_event/empatica/phone_locations.csv | 1 + .../mtz_event/empty/phone_locations.csv | 1 + .../mtz_event/fitbit/phone_locations.csv | 1 + .../mtz_event/ios/phone_locations.csv | 1 + .../mtz_frequency/android/phone_locations.csv | 8 + .../empatica/phone_locations.csv | 1 + .../mtz_frequency/empty/phone_locations.csv | 1 + .../mtz_frequency/fitbit/phone_locations.csv | 1 + .../mtz_frequency/ios/phone_locations.csv | 1 + .../mtz_periodic/android/phone_locations.csv | 16 + .../mtz_periodic/empatica/phone_locations.csv | 1 + .../mtz_periodic/empty/phone_locations.csv | 1 + .../mtz_periodic/fitbit/phone_locations.csv | 1 + .../mtz_periodic/ios/phone_locations.csv | 1 + .../stz_event/android/phone_locations.csv | 3 + .../stz_event/empatica/phone_locations.csv | 1 + .../stz_event/empty/phone_locations.csv | 1 + .../stz_event/fitbit/phone_locations.csv | 1 + .../stz_event/ios/phone_locations.csv | 1 + .../stz_frequency/android/phone_locations.csv | 9 + .../empatica/phone_locations.csv | 1 + .../stz_frequency/empty/phone_locations.csv | 1 + .../stz_frequency/fitbit/phone_locations.csv | 1 + .../stz_frequency/ios/phone_locations.csv | 1 + .../stz_periodic/android/phone_locations.csv | 16 + .../stz_periodic/empatica/phone_locations.csv | 1 + .../stz_periodic/empty/phone_locations.csv | 1 + .../stz_periodic/fitbit/phone_locations.csv | 1 + .../stz_periodic/ios/phone_locations.csv | 1 + tests/settings/mtz_event_config.yaml | 28 +- tests/settings/mtz_frequency_config.yaml | 28 +- tests/settings/mtz_periodic_config.yaml | 28 +- tests/settings/stz_event_config.yaml | 28 +- tests/settings/stz_frequency_config.yaml | 28 +- tests/settings/stz_periodic_config.yaml | 28 +- tools/config.schema.yaml | 32 +- 53 files changed, 669 insertions(+), 702 deletions(-) delete mode 100644 src/data/infer_home_location.py create mode 100644 src/features/phone_locations/doryab/add_doryab_extra_columns.py create mode 100644 src/features/phone_locations/doryab/doryab_clustering.py create mode 100644 tests/data/external/aware_csv/phone_locations_raw.csv create mode 100644 tests/data/manual/aware_csv/phone_locations_raw.csv create mode 100644 tests/data/processed/features/mtz_event/android/phone_locations.csv create mode 100644 tests/data/processed/features/mtz_event/empatica/phone_locations.csv create mode 100644 tests/data/processed/features/mtz_event/empty/phone_locations.csv create mode 100644 tests/data/processed/features/mtz_event/fitbit/phone_locations.csv create mode 100644 tests/data/processed/features/mtz_event/ios/phone_locations.csv create mode 100644 tests/data/processed/features/mtz_frequency/android/phone_locations.csv create mode 100644 tests/data/processed/features/mtz_frequency/empatica/phone_locations.csv create mode 100644 tests/data/processed/features/mtz_frequency/empty/phone_locations.csv create mode 100644 tests/data/processed/features/mtz_frequency/fitbit/phone_locations.csv create mode 100644 tests/data/processed/features/mtz_frequency/ios/phone_locations.csv create mode 100644 tests/data/processed/features/mtz_periodic/android/phone_locations.csv create mode 100644 tests/data/processed/features/mtz_periodic/empatica/phone_locations.csv create mode 100644 tests/data/processed/features/mtz_periodic/empty/phone_locations.csv create mode 100644 tests/data/processed/features/mtz_periodic/fitbit/phone_locations.csv create mode 100644 tests/data/processed/features/mtz_periodic/ios/phone_locations.csv create mode 100644 tests/data/processed/features/stz_event/android/phone_locations.csv create mode 100644 tests/data/processed/features/stz_event/empatica/phone_locations.csv create mode 100644 tests/data/processed/features/stz_event/empty/phone_locations.csv create mode 100644 tests/data/processed/features/stz_event/fitbit/phone_locations.csv create mode 100644 tests/data/processed/features/stz_event/ios/phone_locations.csv create mode 100644 tests/data/processed/features/stz_frequency/android/phone_locations.csv create mode 100644 tests/data/processed/features/stz_frequency/empatica/phone_locations.csv create mode 100644 tests/data/processed/features/stz_frequency/empty/phone_locations.csv create mode 100644 tests/data/processed/features/stz_frequency/fitbit/phone_locations.csv create mode 100644 tests/data/processed/features/stz_frequency/ios/phone_locations.csv create mode 100644 tests/data/processed/features/stz_periodic/android/phone_locations.csv create mode 100644 tests/data/processed/features/stz_periodic/empatica/phone_locations.csv create mode 100644 tests/data/processed/features/stz_periodic/empty/phone_locations.csv create mode 100644 tests/data/processed/features/stz_periodic/fitbit/phone_locations.csv create mode 100644 tests/data/processed/features/stz_periodic/ios/phone_locations.csv diff --git a/Snakefile b/Snakefile index be9e1690..aa5baa42 100644 --- a/Snakefile +++ b/Snakefile @@ -207,11 +207,12 @@ for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys(): if provider == "BARNETT": files_to_compute.extend(expand("data/interim/{pid}/phone_locations_barnett_daily.csv", pid=config["PIDS"])) + if provider == "DORYAB": + files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed_with_datetime_with_doryab_columns.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/raw/{pid}/phone_locations_raw.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed_with_datetime.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed_with_datetime_with_home.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_locations_features/phone_locations_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/phone_locations.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) diff --git a/config.yaml b/config.yaml index 7a548078..eae81b29 100644 --- a/config.yaml +++ b/config.yaml @@ -232,26 +232,22 @@ PHONE_LOCATIONS: LOCATIONS_TO_USE: ALL_RESAMPLED # ALL, GPS, ALL_RESAMPLED, OR FUSED_RESAMPLED FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row - HOME_INFERENCE: - DBSCAN_EPS: 100 # meters - DBSCAN_MINSAMPLES: 5 - THRESHOLD_STATIC : 1 # km/h - CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS PROVIDERS: DORYAB: COMPUTE: False - FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome"] + FEATURES: ["locationvariance","loglocationvariance","totaldistance","avgspeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","avglengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome", "homelabel"] ACCURACY_LIMIT: 100 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius DBSCAN_EPS: 100 # meters DBSCAN_MINSAMPLES: 5 THRESHOLD_STATIC : 1 # km/h - MAXIMUM_ROW_GAP: 300 - MAXIMUM_ROW_DURATION: 60 + MAXIMUM_ROW_GAP: 300 # seconds MINUTES_DATA_USED: False - CLUSTER_ON: PARTICIPANT_DATASET # PARTICIPANT_DATASET,TIME_SEGMENT - CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS - RADIUS_FOR_HOME: 100 + CLUSTER_ON: PARTICIPANT_DATASET # PARTICIPANT_DATASET, TIME_SEGMENT, TIME_SEGMENT_INSTANCE + INFER_HOME_LOCATION_STRATEGY: DORYAB_STRATEGY # DORYAB_STRATEGY, SUN_LI_VEGA_STRATEGY + MINIMUM_DAYS_TO_DETECT_HOME_CHANGES: 3 + CLUSTERING_ALGORITHM: DBSCAN # DBSCAN, OPTICS + RADIUS_FOR_HOME: 100 SRC_SCRIPT: src/features/phone_locations/doryab/main.py BARNETT: diff --git a/docs/change-log.md b/docs/change-log.md index 7e68e20f..317506c8 100644 --- a/docs/change-log.md +++ b/docs/change-log.md @@ -6,6 +6,8 @@ - Add the `EXCLUDE_SLEEP` module for steps intraday features - Fix bug when no phone data yield is needed to process location data - Remove location rows with the same timestamp based on their accuracy +- Refactor location features from Doryab provider +- Add a new strategy to infer home location ## v1.2.0 - Sleep summary and intraday features are more consistent. - Add wake and bedtime features for sleep summary data. diff --git a/docs/citation.md b/docs/citation.md index 0f950695..e7877a9b 100644 --- a/docs/citation.md +++ b/docs/citation.md @@ -55,10 +55,13 @@ If you computed locations features using the provider `[PHONE_LOCATIONS][BARNETT ## Doryab (locations) -If you computed locations features using the provider `[PHONE_LOCATIONS][DORYAB]` cite [this paper](https://arxiv.org/abs/1812.10394) and [this paper](https://doi.org/10.1145/2750858.2805845) in addition to RAPIDS. +If you computed locations features using the provider `[PHONE_LOCATIONS][DORYAB]` cite [this paper](https://arxiv.org/abs/1812.10394) and [this paper](https://doi.org/10.1145/2750858.2805845) in addition to RAPIDS. In addition, if you used the `SUN_LI_VEGA_STRATEGY` strategy, cite [this paper](https://www.jmir.org/2020/9/e19992/) as well. !!! cite "Doryab et al. citation" Doryab, A., Chikarsel, P., Liu, X., & Dey, A. K. (2019). Extraction of Behavioral Features from Smartphone and Wearable Data. ArXiv:1812.10394 [Cs, Stat]. http://arxiv.org/abs/1812.10394 !!! cite "Canzian et al. citation" Luca Canzian and Mirco Musolesi. 2015. Trajectories of depression: unobtrusive monitoring of depressive states by means of smartphone mobility traces analysis. In Proceedings of the 2015 ACM International Joint Conference on Pervasive and Ubiquitous Computing (UbiComp '15). Association for Computing Machinery, New York, NY, USA, 1293–1304. DOI:https://doi.org/10.1145/2750858.2805845 + +!!! cite "Sun et al. citation" + Sun S, Folarin AA, Ranjan Y, Rashid Z, Conde P, Stewart C, Cummins N, Matcham F, Dalla Costa G, Simblett S, Leocani L, Lamers F, Sørensen PS, Buron M, Zabalza A, Guerrero Pérez AI, Penninx BW, Siddi S, Haro JM, Myin-Germeys I, Rintala A, Wykes T, Narayan VA, Comi G, Hotopf M, Dobson RJ, RADAR-CNS Consortium. Using Smartphones and Wearable Devices to Monitor Behavioral Changes During COVID-19. J Med Internet Res 2020;22(9):e19992 diff --git a/docs/developers/test-cases.md b/docs/developers/test-cases.md index a60852fe..ce7d7e2a 100644 --- a/docs/developers/test-cases.md +++ b/docs/developers/test-cases.md @@ -160,6 +160,15 @@ Due to the difference in the format of the raw battery data for iOS and Android that contains data for Android. All other files (i.e. for iPhone) are empty data files. +## Locations + +Description + +- The participant's home location is (latitude=1, longitude=1). +- From Sat 10:56:00 to Sat 11:04:00, the center of the cluster is (latitude=-100, longitude=-100). +- From Sun 03:30:00 to Sun 03:47:00, the center of the cluster is (latitude=1, longitude=1). Home location is extracted from this period. +- From Sun 11:30:00 to Sun 11:38:00, the center of the cluster is (latitude=100, longitude=100). + ## Application Foreground - The raw application foreground data file contains data for 1 day. diff --git a/docs/features/phone-locations.md b/docs/features/phone-locations.md index d46ce1ef..0989b1c7 100644 --- a/docs/features/phone-locations.md +++ b/docs/features/phone-locations.md @@ -111,7 +111,7 @@ These features are based on the original implementation by [Doryab et al.](../.. - data/raw/{pid}/phone_locations_raw.csv - data/interim/{pid}/phone_locations_processed.csv - data/interim/{pid}/phone_locations_processed_with_datetime.csv - - data/interim/{pid}/phone_locations_processed_with_datetime_with_home.csv + - data/interim/{pid}/phone_locations_processed_with_datetime_with_doryab_columns.csv - data/interim/{pid}/phone_locations_features/phone_locations_{language}_{provider_key}.csv - data/processed/features/{pid}/phone_locations.csv ``` @@ -127,11 +127,11 @@ Parameters description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`: | `[DBSCAN_EPS]` | The maximum distance in meters between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function. | `[DBSCAN_MINSAMPLES]` | The number of samples (or total weight) in a neighborhood for a point to be considered as a core point of a cluster. This includes the point itself. | `[THRESHOLD_STATIC]` | It is the threshold value in km/hr which labels a row as Static or Moving. -| `[MAXIMUM_ROW_GAP]` | The maximum gap (in seconds) allowed between any two consecutive rows for them to be considered part of the same displacement. If this threshold is too high, it can throw speed and distance calculations off for periods when the phone was not sensing. -| `[MAXIMUM_ROW_DURATION]` | The time difference between any two consecutive rows `A` and `B` is considered as the time a participant spent in `A`. If this difference is bigger than MAXIMUM_ROW_GAP we substitute it with `MAXIMUM_ROW_DURATION`. +| `[MAXIMUM_ROW_GAP]` | The maximum gap (in seconds) allowed between any two consecutive rows for them to be considered part of the same displacement. If this threshold is too high, it can throw speed and distance calculations off for periods when the phone was not sensing. This value must be larger than your GPS sampling interval when `[LOCATIONS_TO_USE]` is `ALL` or `GPS`, otherwise all the stationary-related features will be NA. If `[LOCATIONS_TO_USE]` is `ALL_RESAMPLED` or `FUSED_RESAMPLED`, you can use the default value as every row will be resampled at 1-minute intervals. | `[MINUTES_DATA_USED]` | Set to `True` to include an extra column in the final location feature file containing the number of minutes used to compute the features on each time segment. Use this for quality control purposes; the more data minutes exist for a period, the more reliable its features should be. For fused location, a single minute can contain more than one coordinate pair if the participant is moving fast enough. -| `[SAMPLING_FREQUENCY]` | Expected time difference between any two location rows in minutes. If set to `0`, the sampling frequency will be inferred automatically as the median of all the differences between two consecutive row timestamps (recommended if you are using `FUSED_RESAMPLED` data). This parameter impacts all the time calculations. -| `[CLUSTER_ON]` | Set this flag to `PARTICIPANT_DATASET` to create clusters based on the entire participant's dataset or to `TIME_SEGMENT` to create clusters based on all the instances of the corresponding time segment (e.g. all mornings). +| `[CLUSTER_ON]` | Set this flag to `PARTICIPANT_DATASET` to create clusters based on the entire participant's dataset or to `TIME_SEGMENT` to create clusters based on all the instances of the corresponding time segment (e.g. all mornings) or to `TIME_SEGMENT_INSTANCE` to create clusters based on a single instance (e.g. 2020-05-20's morning). +|`[INFER_HOME_LOCATION_STRATEGY]` | The strategy applied to infer home locations. Set to `DORYAB_STRATEGY` to infer one home location for the entire dataset of each participant or to `SUN_LI_VEGA_STRATEGY` to infer one home location per day per participant. See Observations below to know more. +|`[MINIMUM_DAYS_TO_DETECT_HOME_CHANGES]` | The minimum number of consecutive days a new home location candidate has to repeat before it is considered the participant's new home. This parameter will be used only when `[INFER_HOME_LOCATION_STRATEGY]` is set to `SUN_LI_VEGA_STRATEGY`. | `[CLUSTERING_ALGORITHM]` | The original Doryab et al. implementation uses `DBSCAN`, `OPTICS` is also available with similar (but not identical) clustering results and lower memory consumption. | `[RADIUS_FOR_HOME]` | All location coordinates within this distance (meters) from the home location coordinates are considered a homestay (see `timeathome` feature). @@ -143,25 +143,25 @@ Features description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`: |locationvariance |$meters^2$ |The sum of the variances of the latitude and longitude columns. |loglocationvariance | - | Log of the sum of the variances of the latitude and longitude columns. |totaldistance |meters |Total distance traveled in a time segment using the haversine formula. -|averagespeed |km/hr |Average speed in a time segment considering only the instances labeled as Moving. +|avgspeed |km/hr |Average speed in a time segment considering only the instances labeled as Moving. |varspeed |km/hr |Speed variance in a time segment considering only the instances labeled as Moving. -|{--circadianmovement--} |- | Not suggested for use now; see Observations below. \ "It encodes the extent to which a person's location patterns follow a 24-hour circadian cycle.\" [Doryab et al.](../../citation#doryab-locations). +|{--circadianmovement--} |- | Deprecated, see Observations below. \ "It encodes the extent to which a person's location patterns follow a 24-hour circadian cycle.\" [Doryab et al.](../../citation#doryab-locations). |numberofsignificantplaces |places |Number of significant locations visited. It is calculated using the DBSCAN/OPTICS clustering algorithm which takes in EPS and MIN_SAMPLES as parameters to identify clusters. Each cluster is a significant place. |numberlocationtransitions |transitions |Number of movements between any two clusters in a time segment. |radiusgyration |meters |Quantifies the area covered by a participant |timeattop1location |minutes |Time spent at the most significant location. |timeattop2location |minutes |Time spent at the 2nd most significant location. |timeattop3location |minutes |Time spent at the 3rd most significant location. -|movingtostaticratio | - | Ratio between stationary time and total location sensed time. A lat/long coordinate pair is labeled as stationary if its speed (distance/time) to the next coordinate pair is less than 1km/hr. A higher value represents a more stationary routine. These times are computed using timeInSeconds feature. -|outlierstimepercent | - | Ratio between the time spent in non-significant clusters divided by the time spent in all clusters (total location sensed time). A higher value represents more time spent in non-significant clusters. These times are computed using timeInSeconds feature. +|movingtostaticratio | - | Ratio between stationary time and total location sensed time. A lat/long coordinate pair is labeled as stationary if its speed (distance/time) to the next coordinate pair is less than 1km/hr. A higher value represents a more stationary routine. +|outlierstimepercent | - | Ratio between the time spent in non-significant clusters divided by the time spent in all clusters (stationary time. Only stationary samples are clustered). A higher value represents more time spent in non-significant clusters. |maxlengthstayatclusters |minutes |Maximum time spent in a cluster (significant location). |minlengthstayatclusters |minutes |Minimum time spent in a cluster (significant location). -|meanlengthstayatclusters |minutes |Average time spent in a cluster (significant location). +|avglengthstayatclusters |minutes |Average time spent in a cluster (significant location). |stdlengthstayatclusters |minutes |Standard deviation of time spent in a cluster (significant location). |locationentropy |nats |Shannon Entropy computed over the row count of each cluster (significant location), it is higher the more rows belong to a cluster (i.e., the more time a participant spent at a significant location). |normalizedlocationentropy |nats |Shannon Entropy computed over the row count of each cluster (significant location) divided by the number of clusters; it is higher the more rows belong to a cluster (i.e., the more time a participant spent at a significant location). |timeathome |minutes | Time spent at home (see Observations below for a description on how we compute home). - +|homelabel |- | An integer that represents a different home location. It will be a constant number (1) for all participants when `[INFER_HOME_LOCATION_STRATEGY]` is set to `DORYAB_STRATEGY` or an incremental index if the strategy is set to `SUN_LI_VEGA_STRATEGY`. !!! note "Assumptions/Observations" **Significant Locations Identified** @@ -174,7 +174,25 @@ Features description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`: Based on an experiment where we collected fused location data for 7 days with a mean accuracy of 86 & SD of 350.874635, we determined that `EPS/MAX_EPS`=100 produced closer clustering results to reality. Higher values (>100) missed out on some significant places, like a short grocery visit, while lower values (<100) picked up traffic lights and stop signs while driving as significant locations. We recommend you set `EPS` based on your location data's accuracy (the more accurate your data is, the lower you should be able to set EPS). **Duration Calculation** - To calculate the time duration component for our features, we compute the difference between consecutive rows' timestamps to take into account sampling rate variability. If this time difference is larger than a threshold (300 seconds by default), we replace it with a maximum duration (60 seconds by default, i.e., we assume a participant spent at least 60 seconds in their last known location) + To calculate the time duration component for our features, we compute the difference between consecutive rows' timestamps to take into account sampling rate variability. If this time difference is larger than a threshold (300 seconds by default), we replace it with NA and label that row as Moving. **Home location** - Home is calculated using all location data of a participant between 12 am and 6 am, then applying a clustering algorithm (`DB_SCAN` or `OPTICS`) and considering the center of the biggest cluster home for that participant. + + - `DORYAB_STRATEGY`: home is calculated using all location data of a participant between 12 am and 6 am, then applying a clustering algorithm (`DBSCAN` or `OPTICS`) and considering the center of the biggest cluster home for that participant. + + - `SUN_LI_VEGA_STRATEGY`: home is calculated using all location data of a participant between 12 am and 6 am, then applying a clustering algorithm (`DBSCAN` or `OPTICS`). The following steps are used to infer the home location per day for that participant: + + 1. if there are records within [03:30:00, 04:30:00] for that night:
+     we choose the most common cluster during that period as a home candidate for that day.
+ elif there are records within [midnight, 03:30:00) for that night:
+     we choose the last valid cluster during that period as a home candidate for that day.
+ elif there are records within (04:30:00, 06:00:00] for that night:
+     we choose the first valid cluster during that period as a home candidate for that day.
+ else:
+     the home location is NA (missing) for that day. + + 2. If the count of consecutive days with the same candidate home location cluster label is larger or equal to `[MINIMUM_DAYS_TO_DETECT_HOME_CHANGES]`, + the candidate will be regarded as the home cluster; otherwise, the home cluster will be the last valid day's cluster. + If there are no valid clusters before that day, the first home location in the days after is used. + + diff --git a/example_profile/example_config.yaml b/example_profile/example_config.yaml index 33c6fa2e..be3deb29 100644 --- a/example_profile/example_config.yaml +++ b/example_profile/example_config.yaml @@ -215,25 +215,21 @@ PHONE_LOCATIONS: LOCATIONS_TO_USE: FUSED_RESAMPLED # ALL, GPS, ALL_RESAMPLED, OR FUSED_RESAMPLED FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row - HOME_INFERENCE: - DBSCAN_EPS: 10 # meters - DBSCAN_MINSAMPLES: 5 - THRESHOLD_STATIC : 1 # km/h - CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS PROVIDERS: DORYAB: COMPUTE: True - FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome"] + FEATURES: ["locationvariance","loglocationvariance","totaldistance","avgspeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","avglengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome"] ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius DBSCAN_EPS: 10 # meters DBSCAN_MINSAMPLES: 5 THRESHOLD_STATIC : 1 # km/h - MAXIMUM_ROW_GAP: 300 - MAXIMUM_ROW_DURATION: 60 + MAXIMUM_ROW_GAP: 300 # seconds MINUTES_DATA_USED: False - CLUSTER_ON: PARTICIPANT_DATASET # PARTICIPANT_DATASET,TIME_SEGMENT - CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS + CLUSTER_ON: PARTICIPANT_DATASET # PARTICIPANT_DATASET, TIME_SEGMENT, TIME_SEGMENT_INSTANCE + INFER_HOME_LOCATION_STRATEGY: DORYAB_STRATEGY # DORYAB_STRATEGY, SUN_LI_VEGA_STRATEGY + MINIMUM_DAYS_TO_DETECT_HOME_CHANGES: 3 + CLUSTERING_ALGORITHM: DBSCAN # DBSCAN, OPTICS RADIUS_FOR_HOME: 100 SRC_SCRIPT: src/features/phone_locations/doryab/main.py diff --git a/rules/common.smk b/rules/common.smk index da31700a..8ff4a191 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -21,6 +21,12 @@ def get_barnett_daily(wildcards): return "data/interim/{pid}/phone_locations_barnett_daily.csv" return [] +def get_locations_python_input(wildcards): + if wildcards.provider_key.upper() == "DORYAB": + return "data/interim/{pid}/phone_locations_processed_with_datetime_with_doryab_columns.csv" + else: + return "data/interim/{pid}/phone_locations_processed_with_datetime.csv" + def find_features_files(wildcards): feature_files = [] for provider_key, provider in config[(wildcards.sensor_key).upper()]["PROVIDERS"].items(): diff --git a/rules/features.smk b/rules/features.smk index 158c03e9..b4af563d 100644 --- a/rules/features.smk +++ b/rules/features.smk @@ -366,9 +366,27 @@ rule phone_light_r_features: script: "../src/features/entry.R" +rule phone_locations_add_doryab_extra_columns: + input: + sensor_input = "data/interim/{pid}/phone_locations_processed_with_datetime.csv", + params: + accuracy_limit = config["PHONE_LOCATIONS"]["PROVIDERS"]["DORYAB"]["ACCURACY_LIMIT"], + maximum_row_gap = config["PHONE_LOCATIONS"]["PROVIDERS"]["DORYAB"]["MAXIMUM_ROW_GAP"], + dbscan_eps = config["PHONE_LOCATIONS"]["PROVIDERS"]["DORYAB"]["DBSCAN_EPS"], + dbscan_minsamples = config["PHONE_LOCATIONS"]["PROVIDERS"]["DORYAB"]["DBSCAN_MINSAMPLES"], + threshold_static = config["PHONE_LOCATIONS"]["PROVIDERS"]["DORYAB"]["THRESHOLD_STATIC"], + clustering_algorithm = config["PHONE_LOCATIONS"]["PROVIDERS"]["DORYAB"]["CLUSTERING_ALGORITHM"], + cluster_on = config["PHONE_LOCATIONS"]["PROVIDERS"]["DORYAB"]["CLUSTER_ON"], + infer_home_location_strategy = config["PHONE_LOCATIONS"]["PROVIDERS"]["DORYAB"]["INFER_HOME_LOCATION_STRATEGY"], + minimum_days_to_detect_home_changes = config["PHONE_LOCATIONS"]["PROVIDERS"]["DORYAB"]["MINIMUM_DAYS_TO_DETECT_HOME_CHANGES"] + output: + "data/interim/{pid}/phone_locations_processed_with_datetime_with_doryab_columns.csv" + script: + "../src/features/phone_locations/doryab/add_doryab_extra_columns.py" + rule phone_locations_python_features: input: - sensor_data = "data/interim/{pid}/phone_locations_processed_with_datetime_with_home.csv", + sensor_data = get_locations_python_input, time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_LOCATIONS"]["PROVIDERS"][wildcards.provider_key.upper()], diff --git a/rules/preprocessing.smk b/rules/preprocessing.smk index 0d393463..14d295d1 100644 --- a/rules/preprocessing.smk +++ b/rules/preprocessing.smk @@ -121,19 +121,6 @@ rule phone_locations_processed_with_datetime: script: "../src/data/datetime/readable_datetime.R" -rule phone_locations_processed_with_datetime_with_home: - input: - sensor_input = "data/interim/{pid}/phone_locations_processed_with_datetime.csv" - params: - dbscan_eps = config["PHONE_LOCATIONS"]["HOME_INFERENCE"]["DBSCAN_EPS"], - dbscan_minsamples = config["PHONE_LOCATIONS"]["HOME_INFERENCE"]["DBSCAN_MINSAMPLES"], - threshold_static = config["PHONE_LOCATIONS"]["HOME_INFERENCE"]["THRESHOLD_STATIC"], - clustering_algorithm = config["PHONE_LOCATIONS"]["HOME_INFERENCE"]["CLUSTERING_ALGORITHM"] - output: - "data/interim/{pid}/phone_locations_processed_with_datetime_with_home.csv" - script: - "../src/data/infer_home_location.py" - rule resample_episodes: input: "data/interim/{pid}/{sensor}_episodes.csv" diff --git a/src/data/infer_home_location.py b/src/data/infer_home_location.py deleted file mode 100644 index 4a5fa342..00000000 --- a/src/data/infer_home_location.py +++ /dev/null @@ -1,140 +0,0 @@ -import pandas as pd -import numpy as np -from sklearn.cluster import DBSCAN,OPTICS -from math import radians, cos, sin, asin, sqrt - -def filterDatafromDf(origDf): - - return origDf[origDf['local_hour']<=6] - -def distance_to_degrees(d): - #Just an approximation, but speeds up clustering by a huge amount and doesnt introduce much error - #over small distances - d = d / 1852 - d = d / 60 - return d - -def cluster_and_label(df,clustering_algorithm,threshold_static,**kwargs): - """ - - :param df: a df with columns "latitude", "longitude", and "datetime" - or - a df with comlumns "latitude","longitude" and a datetime index - :param kwargs: arguments for sklearn's DBSCAN - :return: a new df of labeled locations with moving points removed, where the cluster - labeled as "1" is the largest, "2" the second largest, and so on - """ - if not df.empty: - location_data = df - if not isinstance(df.index, pd.DatetimeIndex): - location_data = df.set_index("local_date_time") - - stationary = mark_moving(location_data,threshold_static) - - counts_df = stationary[["double_latitude" ,"double_longitude"]].groupby(["double_latitude" ,"double_longitude"]).size().reset_index() - counts = counts_df[0] - lat_lon = counts_df[["double_latitude","double_longitude"]].values - - if clustering_algorithm == "DBSCAN": - clusterer = DBSCAN(**kwargs) - cluster_results = clusterer.fit_predict(lat_lon, sample_weight= counts) - else: - clusterer = OPTICS(**kwargs) - cluster_results = clusterer.fit_predict(lat_lon) - - #Need to extend labels back to original df without weights - counts_df["location_label"] = cluster_results - # remove the old count column - del counts_df[0] - - merged = pd.merge(stationary,counts_df, on = ["double_latitude" ,"double_longitude"]) - - #Now compute the label mapping: - cluster_results = merged["location_label"].values - valid_clusters = cluster_results[np.where(cluster_results != -1)] - label_map = rank_count_map(valid_clusters) - - #And remap the labels: - merged.index = stationary.index - stationary = stationary.assign(location_label = merged["location_label"].map(label_map).values) - stationary.loc[:, "location_label"] = merged["location_label"].map(label_map) - return stationary - else: - return df - -def rank_count_map(clusters): - """ Returns a function which will map each element of a list 'l' to its rank, - such that the most common element maps to 1 - - Is used in this context to sort the cluster labels so that cluster with rank 1 is the most - visited. - - If return_dict, return a mapping dict rather than a function - - If a function, if the value can't be found label as -1 - - """ - labels, counts = tuple(np.unique(clusters, return_counts = True)) - sorted_by_count = [x for (y,x) in sorted(zip(counts, labels), reverse = True)] - label_to_rank = {label : rank + 1 for (label, rank) in [(sorted_by_count[i],i) for i in range(len(sorted_by_count))]} - return lambda x: label_to_rank.get(x, -1) - - -def mark_moving(df, threshold_static): - - if not df.index.is_monotonic: - df = df.sort_index() - - distance = haversine(df.double_longitude,df.double_latitude,df.double_longitude.shift(-1),df.double_latitude.shift(-1))/ 1000 - time = (df.timestamp.diff(-1) * -1) / (1000*60*60) - - df['stationary_or_not'] = np.where((distance / time) < threshold_static,1,0) # 1 being stationary,0 for moving - - return df - -def haversine(lon1,lat1,lon2,lat2): - """ - Calculate the great circle distance between two points - on the earth (specified in decimal degrees) - """ - # convert decimal degrees to radians - lon1, lat1, lon2, lat2 = np.radians([lon1, lat1, lon2, lat2]) - - # haversine formula - a = np.sin((lat2-lat1)/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin((lon2-lon1)/2.0)**2 - - r = 6371 # Radius of earth in kilometers. Use 3956 for miles - - return (r * 2 * np.arcsin(np.sqrt(a)) * 1000) - -# Infer a participants home location - -origDf = pd.read_csv(snakemake.input[0]) -filteredDf = filterDatafromDf(origDf) -if filteredDf.empty: - filteredDf.to_csv(snakemake.output[0]) -else: - dbscan_eps = snakemake.params["dbscan_eps"] - dbscan_minsamples = snakemake.params["dbscan_minsamples"] - threshold_static = snakemake.params["threshold_static"] - clustering_algorithm = snakemake.params["clustering_algorithm"] - - if clustering_algorithm == "DBSCAN": - hyperparameters = {'eps' : distance_to_degrees(dbscan_eps), 'min_samples': dbscan_minsamples} - elif clustering_algorithm == "OPTICS": - hyperparameters = {'max_eps': distance_to_degrees(dbscan_eps), 'min_samples': 2, 'metric':'euclidean', 'cluster_method' : 'dbscan'} - else: - raise ValueError("config[PHONE_LOCATIONS][HOME_INFERENCE][CLUSTERING ALGORITHM] only accepts DBSCAN or OPTICS but you provided ",clustering_algorithm) - - filteredDf = cluster_and_label(filteredDf,clustering_algorithm,threshold_static,**hyperparameters) - - origDf['home_latitude'] = filteredDf[filteredDf['location_label']==1][['double_latitude','double_longitude']].mean()['double_latitude'] - origDf['home_longitude'] = filteredDf[filteredDf['location_label']==1][['double_latitude','double_longitude']].mean()['double_longitude'] - - distanceFromHome = haversine(origDf.double_longitude,origDf.double_latitude,origDf.home_longitude,origDf.home_latitude) - - finalDf = origDf.drop(['home_latitude','home_longitude'], axis=1) - finalDf.insert(len(finalDf.columns)-1,'distancefromhome',distanceFromHome) - finalDf.to_csv(snakemake.output[0], index=False) - - diff --git a/src/features/phone_locations/doryab/add_doryab_extra_columns.py b/src/features/phone_locations/doryab/add_doryab_extra_columns.py new file mode 100644 index 00000000..eaa7f985 --- /dev/null +++ b/src/features/phone_locations/doryab/add_doryab_extra_columns.py @@ -0,0 +1,138 @@ +import warnings +import numpy as np +import pandas as pd +from doryab_clustering import haversine, create_clustering_hyperparameters, cluster + + + +# Add "is_stationary" column to denote whether it is stationary or not +# "distance" and "speed" columns are also added +def mark_as_stationary(location_data, threshold_static): + + # Distance in meters + location_data = location_data.assign(distance=haversine(location_data["double_longitude"], location_data["double_latitude"], location_data["double_longitude"].shift(-1), location_data["double_latitude"].shift(-1))) + # Speed in km/h + location_data.loc[:, "speed"] = (location_data["distance"] / location_data["duration_in_seconds"]).replace(np.inf, np.nan) * 3.6 + + location_data.loc[:, "is_stationary"] = np.where(location_data["speed"] < threshold_static, 1, 0) + + location_data.dropna(subset=["duration_in_seconds"], inplace=True) + return location_data + +def infer_home_location(location_data, clustering_algorithm, hyperparameters, strategy, days_threshold): + + # Home locations are inferred based on records logged during midnight to 6am. + # The home location is the mean coordinate of the home cluster. + if (strategy == "DORYAB_STRATEGY") or (strategy == "SUN_LI_VEGA_STRATEGY"): + + location_data_filtered = location_data[location_data["local_hour"] < 6] + + if location_data_filtered.empty: + warnings.warn("We could not infer a home location because there are no location records logged during midnight to 6am.") + return pd.DataFrame(columns=location_data_filtered.columns.tolist() + ["distance_from_home", "home_label"]) + + location_data_filtered = cluster(location_data_filtered, clustering_algorithm, **hyperparameters) + + if strategy == "DORYAB_STRATEGY": + + # We assume the participant does not change the home location during the whole study. + # The most common cluster of all nights are regarded as the home cluster. + home_location = location_data_filtered[location_data_filtered["cluster_label"] == 1][["double_latitude", "double_longitude"]].mean() + location_data["distance_from_home"] = haversine(location_data["double_longitude"], location_data["double_latitude"], [home_location["double_longitude"]] * location_data.shape[0], [home_location["double_latitude"]] * location_data.shape[0]) + location_data["home_label"] = 1 + + else: # SUN_LI_VEGA_STRATEGY + + """ + We assume the participant might change the home location during the whole study. + + Each night will be assigned a candidate home location based on the following rules: + if there are records within [03:30:00, 04:30:00]: (group 1) + we choose the most common cluster during that period as the candidate of home cluster. + elif there are records within [midnight, 03:30:00): (group 2) + we choose the last valid cluster during that period as the candidate of home cluster. + elif there are records within (04:30:00, 06:00:00]: (group 3) + we choose the first valid cluster during that period as the candidate of home cluster. + else: + the home location is NA (missing) for that night. + + If the count of consecutive days with the same candidate home location cluster label is larger or equal to MINIMUM_DAYS_TO_DETECT_HOME_CHANGES, + the candidate will be regarded as the home cluster; + otherwise, the home cluster will be the last valid day's cluster. + (If there are no valid clusters before that day, it will be assigned the next valid day's cluster.) + + """ + + # Split location data into 3 groups: [midnight, 03:30:00), [03:30:00, 04:30:00], (04:30:00, 06:00:00] + location_data_filtered = location_data_filtered[~location_data_filtered["cluster_label"].isin([-1, np.nan])] + location_data_filtered["group"] = location_data_filtered["local_time"].apply(lambda x: 1 if x >= "03:30:00" and x <= "04:30:00" else (2 if x < "03:30:00" else 3)) + + # Select the smallest group number per day + selected_groups = location_data_filtered[location_data_filtered["group"] == location_data_filtered.groupby("local_date")["group"].transform("min")][["group", "local_date", "cluster_label"]] + + # For group 1: [03:30:00, 04:30:00] + group_1 = selected_groups[selected_groups["group"] == 1] + home_clusters_group_1 = group_1.groupby(["local_date"]).agg(lambda x: pd.Series.mode(x)[0]) + # For group 2: [midnight, 03:30:00) + group_2 = selected_groups[selected_groups["group"] == 2] + home_clusters_group_2 = group_2.groupby(["local_date"]).last() + # For group 3: (04:30:00, 06:00:00] + group_3 = selected_groups[selected_groups["group"] == 3] + home_clusters_group_3 = group_3.groupby(["local_date"]).first() + + home_clusters = pd.concat([home_clusters_group_1, home_clusters_group_2, home_clusters_group_3]).sort_index() + + # Count the consecutive days with the same candidate home location cluster label + home_clusters["number_of_days"] = home_clusters.groupby((home_clusters["cluster_label"] != home_clusters["cluster_label"].shift(1)).cumsum())["cluster_label"].transform("count") + # Assign the missing days with (1) the last valid day's cluster first and (2) the next valid day's cluster then + home_clusters.loc[home_clusters["number_of_days"] < days_threshold, "cluster_label"] = np.nan + location_data = location_data.merge(home_clusters[["cluster_label"]], left_on="local_date", right_index=True, how="left") + location_data["cluster_label"] = location_data["cluster_label"].fillna(method="ffill").fillna(method="bfill") + + center_per_cluster = location_data_filtered.groupby(["cluster_label"])[["double_latitude", "double_longitude"]].mean().rename(columns={"double_latitude": "home_latitude", "double_longitude": "home_longitude"}) + location_data = location_data.merge(center_per_cluster, left_on="cluster_label", right_index=True, how="left") + location_data["distance_from_home"] = haversine(location_data["double_longitude"], location_data["double_latitude"], location_data["home_longitude"], location_data["home_latitude"]) + + # reorder cluster labels + reorder_mapping = {old_label: idx + 1 for idx, old_label in enumerate(location_data["cluster_label"].unique())} + location_data["home_label"] = location_data["cluster_label"].map(reorder_mapping) + + location_data.drop(["cluster_label", "home_longitude", "home_latitude"], axis=1, inplace=True) + + return location_data + + + +location_data = pd.read_csv(snakemake.input["sensor_input"]) +accuracy_limit = snakemake.params["accuracy_limit"] +maximum_row_gap = snakemake.params["maximum_row_gap"] +dbscan_eps = snakemake.params["dbscan_eps"] +dbscan_minsamples = snakemake.params["dbscan_minsamples"] +threshold_static = snakemake.params["threshold_static"] +clustering_algorithm = snakemake.params["clustering_algorithm"] +cluster_on = snakemake.params["cluster_on"] +strategy = snakemake.params["infer_home_location_strategy"] +days_threshold = snakemake.params["minimum_days_to_detect_home_changes"] + +rows_before_accuracy_filter = len(location_data) +location_data = location_data[location_data["accuracy"] < accuracy_limit] + +if rows_before_accuracy_filter > 0 and len(location_data) == 0: + warnings.warn("Cannot compute Doryab location features because there are no rows with an accuracy value lower than ACCURACY_LIMIT: {}".format(accuracy_limit)) + +if not location_data.timestamp.is_monotonic: + location_data.sort_values(by=["timestamp"], inplace=True) + +location_data["duration_in_seconds"] = -1 * location_data.timestamp.diff(-1) / 1000 +location_data.loc[location_data["duration_in_seconds"] >= maximum_row_gap, "duration_in_seconds"] = np.nan + +location_data = mark_as_stationary(location_data, threshold_static) + +hyperparameters = create_clustering_hyperparameters(clustering_algorithm, dbscan_eps, dbscan_minsamples) +location_data_with_doryab_columns = infer_home_location(location_data, clustering_algorithm, hyperparameters, strategy, days_threshold) + +if cluster_on == "PARTICIPANT_DATASET": + location_data_with_doryab_columns = cluster(location_data_with_doryab_columns, clustering_algorithm, **hyperparameters) + +location_data_with_doryab_columns.to_csv(snakemake.output[0], index=False) + diff --git a/src/features/phone_locations/doryab/doryab_clustering.py b/src/features/phone_locations/doryab/doryab_clustering.py new file mode 100644 index 00000000..f62f4df8 --- /dev/null +++ b/src/features/phone_locations/doryab/doryab_clustering.py @@ -0,0 +1,77 @@ +import pandas as pd +import numpy as np +from sklearn.cluster import DBSCAN, OPTICS + + + +# Calculate the great-circle distance (in meters) between two points on the earth (specified in decimal degrees) +def haversine(lon1, lat1, lon2, lat2): + # Radius of earth in kilometers. Use 3956 for miles + r = 6371 + # Convert decimal degrees to radians + lon1, lat1, lon2, lat2 = np.radians([lon1, lat1, lon2, lat2]) + # Haversine formula + distance = r * 2 * np.arcsin(np.sqrt(np.sin((lat2 - lat1) / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0) ** 2)) * 1000 + return distance + +# Just an approximation, but speeds up clustering by a huge amount and doesn't introduce much error over small distances +# Reference: https://jonisalonen.com/2014/computing-distance-between-coordinates-can-be-simple-and-fast/ +def meters_to_degrees(distance): + # Convert meter to nautical mile + distance = distance / 1852 + # Convert nautical mile to degree + distance = distance / 60 + return distance + +# Relabel clusters: -1 denotes the outliers (insignificant or rarely visited locations), 1 denotes the most visited significant location, 2 denotes the 2nd most significant location,... +def label(location_data): + + # Exclude outliers (cluster_label = -1) while counting number of locations in a cluster + label2count = pd.DataFrame({"count": location_data["cluster_label"].replace(-1, np.nan).value_counts(ascending=False, sort=True)}) + # Add the row number as the new cluster label since value_counts() will order it by default + label2count["new_cluster_label"] = np.arange(len(label2count)) + 1 + # Still use -1 to denote the outliers + label2count.loc[-1, "new_cluster_label"] = -1 + # Merge the new cluster label with the original location data + location_data = location_data.merge(label2count[["new_cluster_label"]], left_on="cluster_label", right_index=True, how="left") + + del location_data["cluster_label"] + location_data.rename(columns={"new_cluster_label": "cluster_label"}, inplace=True) + + return location_data + +def create_clustering_hyperparameters(clustering_algorithm, dbscan_eps, dbscan_minsamples): + if clustering_algorithm == "DBSCAN": + hyperparameters = {"eps": meters_to_degrees(dbscan_eps), "min_samples": dbscan_minsamples} + else: # OPTICS + hyperparameters = {"max_eps": meters_to_degrees(dbscan_eps), "min_samples": dbscan_minsamples, "metric": "euclidean", "cluster_method": "dbscan"} + + return hyperparameters + +# Only stationary samples are clustered, hence moving samples are labeled with NA +def cluster(location_data, clustering_algorithm, **kwargs): + + if location_data.empty: + return pd.DataFrame(columns=location_data.columns.tolist() + ["is_stationary", "cluster_label"]) + + # Only keep stationary samples for clustering + stationary_data = location_data[location_data["is_stationary"] == 1][["double_latitude", "double_longitude", "is_stationary"]] + + # Remove duplicates and apply sample_weight (only available for DBSCAN currently) to reduce memory usage + stationary_data_dedup = stationary_data.groupby(["double_latitude", "double_longitude", "is_stationary"]).size().reset_index() + lat_lon = stationary_data_dedup[["double_latitude", "double_longitude"]].values + + if stationary_data_dedup.shape[0] < kwargs["min_samples"]: + cluster_results = np.array([-1] * stationary_data_dedup.shape[0]) + elif clustering_algorithm == "DBSCAN": + clusterer = DBSCAN(**kwargs) + cluster_results = clusterer.fit_predict(lat_lon, sample_weight=stationary_data_dedup[0]) + else: # OPTICS + clusterer = OPTICS(**kwargs) + cluster_results = clusterer.fit_predict(lat_lon) + + # Add cluster labels + stationary_data_dedup["cluster_label"] = cluster_results + location_data_with_labels = label(location_data.merge(stationary_data_dedup[["double_latitude", "double_longitude", "is_stationary", "cluster_label"]], how="left", on=["double_latitude", "double_longitude", "is_stationary"])) + + return location_data_with_labels diff --git a/src/features/phone_locations/doryab/main.py b/src/features/phone_locations/doryab/main.py index 1913377a..c779279f 100644 --- a/src/features/phone_locations/doryab/main.py +++ b/src/features/phone_locations/doryab/main.py @@ -1,427 +1,156 @@ -import pandas as pd import numpy as np -import warnings -from astropy.timeseries import LombScargle -from sklearn.cluster import DBSCAN,OPTICS -from math import radians, cos, sin, asin, sqrt +import pandas as pd +from phone_locations.doryab.doryab_clustering import haversine, create_clustering_hyperparameters, cluster + + + +def apply_cluster_strategy(location_data, time_segment, clustering_algorithm, dbscan_eps, dbscan_minsamples, cluster_on, filter_data_by_segment): + + hyperparameters = create_clustering_hyperparameters(clustering_algorithm, dbscan_eps, dbscan_minsamples) + + if cluster_on == "PARTICIPANT_DATASET": + # clusters are created in cluster_accross_participant_dataset.py script + location_data = filter_data_by_segment(location_data, time_segment) + elif cluster_on == "TIME_SEGMENT": + location_data = filter_data_by_segment(location_data, time_segment) + location_data = cluster(location_data, clustering_algorithm, **hyperparameters) + else: # TIME_SEGMENT_INSTANCE + location_data = filter_data_by_segment(location_data, time_segment) + location_data_clusters = pd.DataFrame() + for segment_instance, instance_data in location_data.groupby(["local_segment"]): + location_data_per_group = cluster(instance_data, clustering_algorithm, **hyperparameters) + location_data_clusters = pd.concat([location_data_per_group, location_data_clusters]) + location_data = location_data_clusters + return location_data + +def distance_and_speed_features(moving_data): + + distance_and_speed = moving_data[["local_segment", "distance"]].groupby(["local_segment"]).sum().rename(columns={"distance": "totaldistance"}) + + moving_data_grouped = moving_data[["local_segment", "speed"]].groupby(["local_segment"]) + distance_and_speed["avgspeed"] = moving_data_grouped["speed"].mean() + distance_and_speed["varspeed"] = moving_data_grouped["speed"].var() + + return distance_and_speed + +def radius_of_gyration(location_data): + + # center is the centroid of the places visited during a segment instance, not the home location + clusters = location_data.groupby(["local_segment", "cluster_label"]).agg( + double_latitude=("double_latitude", "mean"), + double_longitude=("double_longitude", "mean"), + time_in_a_cluster=("duration_in_seconds", "sum") + ).reset_index() + + clusters[["centroid_double_latitude", "centroid_double_longitude"]] = clusters.groupby(["local_segment"], sort=False)[["double_latitude", "double_longitude"]].transform("mean") + clusters["distance_squared"] = haversine(clusters["double_longitude"], clusters["double_latitude"], clusters["centroid_double_longitude"], clusters["centroid_double_latitude"]) ** 2 + + clusters["distance_squared_X_time_in_a_cluster"] = clusters["distance_squared"] * clusters["time_in_a_cluster"] + rog = np.sqrt(clusters.groupby(["local_segment"])["distance_squared_X_time_in_a_cluster"].sum() / clusters.groupby(["local_segment"])["time_in_a_cluster"].sum().replace(0, np.inf)) + + return rog + +def cluster_stay(x, stay_at_clusters, cluster_n): + topn_cluster_label = x[stay_at_clusters.loc[x.index]["cluster_label"] == cluster_n] + time_at_topn = topn_cluster_label.iloc[0] if len(topn_cluster_label) == 1 else None + return time_at_topn + +def stay_at_topn_clusters(location_data): + + stay_at_clusters = location_data[["local_segment", "cluster_label", "duration_in_seconds"]].groupby(["local_segment", "cluster_label"], sort=True).sum().reset_index() + stay_at_clusters["duration_in_minutes"] = stay_at_clusters["duration_in_seconds"] / 60 + + stay_at_clusters_features = stay_at_clusters.groupby(["local_segment"]).agg( + timeattop1location=("duration_in_minutes", lambda x: cluster_stay(x, stay_at_clusters, 1)), + timeattop2location=("duration_in_minutes", lambda x: cluster_stay(x, stay_at_clusters, 2)), + timeattop3location=("duration_in_minutes", lambda x: cluster_stay(x, stay_at_clusters, 3)), + maxlengthstayatclusters=("duration_in_minutes", "max"), + minlengthstayatclusters=("duration_in_minutes", "min"), + avglengthstayatclusters=("duration_in_minutes", "mean"), + stdlengthstayatclusters=("duration_in_minutes", "std") + ).fillna(0) + + return stay_at_clusters_features + +def location_entropy(location_data): + + location_data = location_data.groupby(["local_segment", "cluster_label"])[["duration_in_seconds"]].sum().reset_index().rename(columns={"duration_in_seconds": "cluster_duration"}) + location_data["all_clusters_duration"] = location_data.groupby(["local_segment"])["cluster_duration"].transform("sum") + location_data["plogp"] = (location_data["cluster_duration"] / location_data["all_clusters_duration"]).apply(lambda x: x * np.log(x)) + + entropy = -1 * location_data.groupby(["local_segment"])[["plogp"]].sum().rename(columns={"plogp": "locationentropy"}) + + entropy["num_clusters"] = location_data.groupby(["local_segment"])["cluster_label"].nunique() + entropy["normalizedlocationentropy"] = entropy["locationentropy"] / entropy["num_clusters"] + + return entropy + + def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs): location_data = pd.read_csv(sensor_data_files["sensor_data"]) requested_features = provider["FEATURES"] - accuracy_limit = provider["ACCURACY_LIMIT"] dbscan_eps = provider["DBSCAN_EPS"] dbscan_minsamples = provider["DBSCAN_MINSAMPLES"] - threshold_static = provider["THRESHOLD_STATIC"] - maximum_gap_allowed = provider["MAXIMUM_ROW_GAP"] - maximum_row_duration = provider["MAXIMUM_ROW_DURATION"] cluster_on = provider["CLUSTER_ON"] clustering_algorithm = provider["CLUSTERING_ALGORITHM"] radius_from_home = provider["RADIUS_FOR_HOME"] - minutes_data_used = provider["MINUTES_DATA_USED"] - if(minutes_data_used): - requested_features.append("minutesdataused") + if provider["MINUTES_DATA_USED"]: + requested_features.append("minutesdataused") # name of the features this function can compute - base_features_names = ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","minutesdataused","timeathome"] + base_features_names = ["locationvariance","loglocationvariance","totaldistance","avgspeed","varspeed","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","avglengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","minutesdataused","timeathome","homelabel"] # the subset of requested features this function can compute features_to_compute = list(set(requested_features) & set(base_features_names)) - - if clustering_algorithm == "DBSCAN": - hyperparameters = {'eps' : distance_to_degrees(dbscan_eps), 'min_samples': dbscan_minsamples} - elif clustering_algorithm == "OPTICS": - hyperparameters = {'max_eps': distance_to_degrees(dbscan_eps), 'min_samples': 2, 'metric':'euclidean', 'cluster_method' : 'dbscan'} - else: - raise ValueError("config[PHONE_LOCATIONS][DORYAB][CLUSTERING ALGORITHM] only accepts DBSCAN or OPTICS but you provided ",clustering_algorithm) - rows_before_accuracy_filter = len(location_data) - location_data.query("accuracy < @accuracy_limit", inplace=True) - if rows_before_accuracy_filter > 0 and len(location_data) == 0: - warnings.warn("Cannot compute Doryab location features because there are no rows with an accuracy value lower than ACCURACY_LIMIT: {}".format(accuracy_limit)) + location_data = apply_cluster_strategy(location_data, time_segment, clustering_algorithm, dbscan_eps, dbscan_minsamples, cluster_on, filter_data_by_segment) if location_data.empty: - location_features = pd.DataFrame(columns=["local_segment"] + features_to_compute) - else: - if cluster_on == "PARTICIPANT_DATASET": - location_data = cluster_and_label(location_data,clustering_algorithm,threshold_static,**hyperparameters) - location_data = filter_data_by_segment(location_data, time_segment) - elif cluster_on == "TIME_SEGMENT": - location_data = filter_data_by_segment(location_data, time_segment) - location_data = cluster_and_label(location_data,clustering_algorithm,threshold_static,**hyperparameters) - else: - raise ValueError("config[PHONE_LOCATIONS][DORYAB][CLUSTER_ON] only accepts PARTICIPANT_DATASET or TIME_SEGMENT but you provided ",cluster_on) + return pd.DataFrame(columns=["local_segment"] + features_to_compute) + location_features = pd.DataFrame() - if location_data.empty: - location_features = pd.DataFrame(columns=["local_segment"] + features_to_compute) - else: - location_features = pd.DataFrame() + location_features["minutesdataused"] = location_data.drop_duplicates(subset=["local_segment", "local_date", "local_hour", "local_minute"])[["local_segment", "local_minute"]].groupby(["local_segment"])["local_minute"].count() - if "minutesdataused" in features_to_compute: - for localDate in location_data["local_segment"].unique(): - location_features.loc[localDate,"minutesdataused"] = getMinutesData(location_data[location_data["local_segment"]==localDate]) + # variance features + location_features["locationvariance"] = location_data.groupby(["local_segment"])["double_latitude"].var() + location_data.groupby(["local_segment"])["double_longitude"].var() + location_features["loglocationvariance"] = np.log10(location_features["locationvariance"]).replace(-np.inf, np.nan) - location_features.index.name = 'local_segment' - - location_data = location_data[(location_data['double_latitude']!=0.0) & (location_data['double_longitude']!=0.0)] + # distance and speed features + moving_data = location_data[location_data["is_stationary"] == 0] + location_features = location_features.merge(distance_and_speed_features(moving_data), how="outer", left_index=True, right_index=True) - if location_data.empty: - location_features = pd.DataFrame(columns=["local_segment"] + ["location_" + time_segment + "_" + x for x in features_to_compute]) - location_features = location_features.reset_index(drop=True) - return location_features + # stationary features + stationary_data = location_data[location_data["is_stationary"] == 1] + stationary_data_without_outliers = stationary_data[stationary_data["cluster_label"] != -1] - location_data['timeInSeconds'] = (location_data.timestamp.diff(-1)* -1)/1000 - if "locationvariance" in features_to_compute: - location_features["locationvariance"] = location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var() - - if "loglocationvariance" in features_to_compute: - location_features["loglocationvariance"] = (location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var()).apply(lambda x: np.log10(x) if x > 0 else None) + location_features["numberofsignificantplaces"] = stationary_data_without_outliers.groupby(["local_segment"])["cluster_label"].nunique() + # number of location transitions: ignores transitions from moving to static and vice-versa, but counts transitions from outliers to major location clusters + location_features["numberlocationtransitions"] = stationary_data[["local_segment", "cluster_label"]].groupby(["local_segment"])["cluster_label"].apply(lambda x: np.sum(x != x.shift()) - 1) + location_features["radiusgyration"] = radius_of_gyration(stationary_data_without_outliers) + + # stay at topn clusters features + location_features = location_features.merge(stay_at_topn_clusters(stationary_data_without_outliers), how="outer", left_index=True, right_index=True) - - preComputedDistanceandSpeed = pd.DataFrame() - for localDate in location_data['local_segment'].unique(): - speeddf = get_all_travel_distances_meters_speed(location_data[location_data['local_segment']==localDate],threshold_static,maximum_gap_allowed) - preComputedDistanceandSpeed.loc[localDate,"distance"] = speeddf['distances'].sum() - preComputedDistanceandSpeed.loc[localDate,"avgspeed"] = speeddf[speeddf['speedTag'] == 'Moving']['speed'].mean() - preComputedDistanceandSpeed.loc[localDate,"varspeed"] = speeddf[speeddf['speedTag'] == 'Moving']['speed'].var() + # moving to static ratio + static_time = stationary_data.groupby(["local_segment"])["duration_in_seconds"].sum() + total_time = location_data.groupby(["local_segment"])["duration_in_seconds"].sum() + location_features["movingtostaticratio"] = static_time / total_time - if "totaldistance" in features_to_compute: - for localDate in location_data['local_segment'].unique(): - location_features.loc[localDate,"totaldistance"] = preComputedDistanceandSpeed.loc[localDate,"distance"] + # outliers time percent + outliers_time = stationary_data[stationary_data["cluster_label"] == -1].groupby(["local_segment"])["duration_in_seconds"].sum() + location_features["outlierstimepercent"] = outliers_time / static_time - if "averagespeed" in features_to_compute: - for localDate in location_data['local_segment'].unique(): - location_features.loc[localDate,"averagespeed"] = preComputedDistanceandSpeed.loc[localDate,"avgspeed"] + # entropy features + location_features = location_features.merge(location_entropy(stationary_data_without_outliers), how="outer", left_index=True, right_index=True) - if "varspeed" in features_to_compute: - for localDate in location_data['local_segment'].unique(): - location_features.loc[localDate,"varspeed"] = preComputedDistanceandSpeed.loc[localDate,"varspeed"] + # time at home + location_features["timeathome"] = stationary_data[stationary_data["distance_from_home"] <= radius_from_home].groupby(["local_segment"])["duration_in_seconds"].sum() / 60 - if "circadianmovement" in features_to_compute: - for localDate in location_data['local_segment'].unique(): - location_features.loc[localDate,"circadianmovement"] = circadian_movement(location_data[location_data['local_segment']==localDate]) + # home label + location_features["homelabel"] = stationary_data[["local_segment", "home_label"]].groupby(["local_segment"]).agg(lambda x: pd.Series.mode(x)[0]) - - stationaryLocations = location_data[location_data['stationary_or_not'] == 1] - - if "numberofsignificantplaces" in features_to_compute: - for localDate in stationaryLocations['local_segment'].unique(): - location_features.loc[localDate,"numberofsignificantplaces"] = number_of_significant_places(stationaryLocations[stationaryLocations['local_segment']==localDate]) - - if "numberlocationtransitions" in features_to_compute: - for localDate in stationaryLocations['local_segment'].unique(): - location_features.loc[localDate,"numberlocationtransitions"] = number_location_transitions(stationaryLocations[stationaryLocations['local_segment']==localDate]) - - if "radiusgyration" in features_to_compute: - for localDate in stationaryLocations['local_segment'].unique(): - location_features.loc[localDate,"radiusgyration"] = radius_of_gyration(stationaryLocations[stationaryLocations['local_segment']==localDate]) - - preComputedTimeArray = pd.DataFrame() - for localDate in stationaryLocations["local_segment"].unique(): - top1,top2,top3,smax,smin,sstd,smean = len_stay_timeattopn(stationaryLocations[stationaryLocations["local_segment"]==localDate],maximum_gap_allowed,maximum_row_duration) - preComputedTimeArray.loc[localDate,"timeattop1"] = top1 - preComputedTimeArray.loc[localDate,"timeattop2"] = top2 - preComputedTimeArray.loc[localDate,"timeattop3"] = top3 - preComputedTimeArray.loc[localDate,"maxlengthstayatclusters"] = smax - preComputedTimeArray.loc[localDate,"minlengthstayatclusters"] = smin - preComputedTimeArray.loc[localDate,"stdlengthstayatclusters"] = sstd - preComputedTimeArray.loc[localDate,"meanlengthstayatclusters"] = smean - - if "timeattop1location" in features_to_compute: - for localDate in stationaryLocations['local_segment'].unique(): - location_features.loc[localDate,"timeattop1"] = preComputedTimeArray.loc[localDate,"timeattop1"] - - if "timeattop2location" in features_to_compute: - for localDate in stationaryLocations['local_segment'].unique(): - location_features.loc[localDate,"timeattop2"] = preComputedTimeArray.loc[localDate,"timeattop2"] - - if "timeattop3location" in features_to_compute: - for localDate in stationaryLocations['local_segment'].unique(): - location_features.loc[localDate,"timeattop3"] = preComputedTimeArray.loc[localDate,"timeattop3"] - - if "movingtostaticratio" in features_to_compute: - for localDate in stationaryLocations['local_segment'].unique(): - location_features.loc[localDate,"movingtostaticratio"] = (stationaryLocations[stationaryLocations['local_segment']==localDate]['timeInSeconds'].sum()) / (location_data[location_data['local_segment']==localDate]['timeInSeconds'].sum()) - - if "outlierstimepercent" in features_to_compute: - for localDate in stationaryLocations['local_segment'].unique(): - location_features.loc[localDate,"outlierstimepercent"] = outlier_time_percent_new(stationaryLocations[stationaryLocations['local_segment']==localDate]) - - if "maxlengthstayatclusters" in features_to_compute: - for localDate in stationaryLocations['local_segment'].unique(): - location_features.loc[localDate,"maxlengthstayatclusters"] = preComputedTimeArray.loc[localDate,"maxlengthstayatclusters"] - - if "minlengthstayatclusters" in features_to_compute: - for localDate in stationaryLocations['local_segment'].unique(): - location_features.loc[localDate,"minlengthstayatclusters"] = preComputedTimeArray.loc[localDate,"minlengthstayatclusters"] - - if "stdlengthstayatclusters" in features_to_compute: - for localDate in stationaryLocations['local_segment'].unique(): - location_features.loc[localDate,"stdlengthstayatclusters"] = preComputedTimeArray.loc[localDate,"stdlengthstayatclusters"] - - if "meanlengthstayatclusters" in features_to_compute: - for localDate in stationaryLocations['local_segment'].unique(): - location_features.loc[localDate,"meanlengthstayatclusters"] = preComputedTimeArray.loc[localDate,"meanlengthstayatclusters"] - - if "locationentropy" in features_to_compute: - for localDate in stationaryLocations['local_segment'].unique(): - location_features.loc[localDate,"locationentropy"] = location_entropy(stationaryLocations[stationaryLocations['local_segment']==localDate]) - - if "normalizedlocationentropy" in features_to_compute: - for localDate in stationaryLocations['local_segment'].unique(): - location_features.loc[localDate,"normalizedlocationentropy"] = location_entropy_normalized(stationaryLocations[stationaryLocations['local_segment']==localDate]) - - if "timeathome" in features_to_compute: - calculationDf = stationaryLocations[['local_segment','distancefromhome','timeInSeconds']].copy() - calculationDf.loc[calculationDf.timeInSeconds >= maximum_gap_allowed,'timeInSeconds'] = maximum_row_duration - location_features["timeathome"] = calculationDf[calculationDf["distancefromhome"] <= radius_from_home].groupby("local_segment")["timeInSeconds"].sum()/60 - - location_features = location_features.reset_index() + location_features = location_features[features_to_compute].reset_index() return location_features - -def len_stay_timeattopn(locationData,maximum_gap_allowed,maximum_row_duration): - if locationData is None or len(locationData) == 0: - return (None, None, None,None, None, None, None) - - calculationDf = locationData[locationData["location_label"] >= 1][['location_label','timeInSeconds']].copy() - calculationDf.loc[calculationDf.timeInSeconds >= maximum_gap_allowed,'timeInSeconds'] = maximum_row_duration - timeArray = calculationDf.groupby('location_label')['timeInSeconds'].sum().reset_index()['timeInSeconds'].sort_values(ascending=False)/60 - - if len(timeArray) == 3: - return (timeArray[0],timeArray[1],timeArray[2],timeArray.max(),timeArray.min(),timeArray.std(),timeArray.mean()) - elif len(timeArray)==2: - return (timeArray[0],timeArray[1],None,timeArray.max(),timeArray.min(),timeArray.std(),timeArray.mean()) - elif len(timeArray)==1: - return (timeArray[0],None,None,timeArray.max(),timeArray.min(),timeArray.std(),timeArray.mean()) - else: - return (None,None,None,timeArray.max(),timeArray.min(),timeArray.std(),timeArray.mean()) - - -def getMinutesData(locationData): - - return locationData[['local_hour','local_minute']].drop_duplicates(inplace = False).shape[0] - -def distance_to_degrees(d): - #Just an approximation, but speeds up clustering by a huge amount and doesnt introduce much error - #over small distances - d = d / 1852 - d = d / 60 - return d - -def get_all_travel_distances_meters_speed(locationData,threshold,maximum_gap_allowed): - - lat_lon_temp = locationData[locationData['timeInSeconds'] <= maximum_gap_allowed][['double_latitude','double_longitude','timeInSeconds']] - - if lat_lon_temp.empty: - return pd.DataFrame({"speed": [], "speedTag": [],"distances": []}) - - lat_lon_temp['distances'] = haversine(lat_lon_temp['double_longitude'],lat_lon_temp['double_latitude'],lat_lon_temp['double_longitude'].shift(-1),lat_lon_temp['double_latitude'].shift(-1)) - lat_lon_temp['speed'] = (lat_lon_temp['distances'] / lat_lon_temp['timeInSeconds'] ) # meter/second - lat_lon_temp['speed'] = lat_lon_temp['speed'].replace(np.inf, np.nan) * 3.6 - - lat_lon_temp['speedTag'] = np.where(lat_lon_temp['speed'] >= threshold,"Moving","Static") - - return lat_lon_temp[['speed','speedTag','distances']] - - -def vincenty_row(x): - """ - :param x: A row from a dataframe - :return: The distance in meters between - """ - - try: - return vincenty((x['_lat_before'], x['_lon_before']),(x['_lat_after'], x['_lon_after'])).meters - - except: - return 0 - -def haversine(lon1,lat1,lon2,lat2): - """ - Calculate the great circle distance between two points - on the earth (specified in decimal degrees) - """ - # convert decimal degrees to radians - lon1, lat1, lon2, lat2 = np.radians([lon1, lat1, lon2, lat2]) - - # haversine formula - a = np.sin((lat2-lat1)/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin((lon2-lon1)/2.0)**2 - - r = 6371 # Radius of earth in kilometers. Use 3956 for miles - - return (r * 2 * np.arcsin(np.sqrt(a)) * 1000) - - -def circadian_movement_energies(locationData): - time = (locationData["timestamp"].values / 1000.0) # seconds - ylat = locationData["double_latitude"].values - ylong = locationData["double_longitude"].values - hours_intervals = np.arange(23.5, 24.51, 0.01) # hours - seconds_intervals = hours_intervals * 60 * 60 # seconds - frequency = 1 / seconds_intervals - - power_latitude = LombScargle(time, ylat).power(frequency=frequency, normalization='psd') - power_longitude = LombScargle(time, ylong).power(frequency=frequency, normalization='psd') - - energy_latitude = np.sum(power_latitude) - energy_longitude = np.sum(power_longitude) - return (energy_latitude, energy_longitude) - -def circadian_movement(locationData): - - energy_latitude, energy_longitude = circadian_movement_energies(locationData) - return np.log10(energy_latitude + energy_longitude) - -def cluster_and_label(df,clustering_algorithm,threshold_static,**kwargs): - """ - - :param df: a df with columns "latitude", "longitude", and "datetime" - or - a df with comlumns "latitude","longitude" and a datetime index - :param kwargs: arguments for sklearn's DBSCAN - :return: a new df of labeled locations with moving points removed, where the cluster - labeled as "1" is the largest, "2" the second largest, and so on - """ - if not df.empty: - location_data = df - if not isinstance(df.index, pd.DatetimeIndex): - location_data = df.set_index("local_date_time") - - stationary = mark_moving(location_data,threshold_static) - - counts_df = stationary[["double_latitude" ,"double_longitude"]].groupby(["double_latitude" ,"double_longitude"]).size().reset_index() - counts = counts_df[0] - lat_lon = counts_df[["double_latitude","double_longitude"]].values - - if clustering_algorithm == "DBSCAN": - clusterer = DBSCAN(**kwargs) - cluster_results = clusterer.fit_predict(lat_lon, sample_weight= counts) - else: - clusterer = OPTICS(**kwargs) - cluster_results = clusterer.fit_predict(lat_lon) - - #Need to extend labels back to original df without weights - counts_df["location_label"] = cluster_results - # remove the old count column - del counts_df[0] - - merged = pd.merge(stationary,counts_df, on = ["double_latitude" ,"double_longitude"]) - - #Now compute the label mapping: - cluster_results = merged["location_label"].values - valid_clusters = cluster_results[np.where(cluster_results != -1)] - label_map = rank_count_map(valid_clusters) - - #And remap the labels: - merged.index = stationary.index - stationary = stationary.assign(location_label = merged["location_label"].map(label_map).values) - stationary.loc[:, "location_label"] = merged["location_label"].map(label_map) - return stationary - else: - return df - -def rank_count_map(clusters): - """ Returns a function which will map each element of a list 'l' to its rank, - such that the most common element maps to 1 - - Is used in this context to sort the cluster labels so that cluster with rank 1 is the most - visited. - - If return_dict, return a mapping dict rather than a function - - If a function, if the value can't be found label as -1 - - """ - labels, counts = tuple(np.unique(clusters, return_counts = True)) - sorted_by_count = [x for (y,x) in sorted(zip(counts, labels), reverse = True)] - label_to_rank = {label : rank + 1 for (label, rank) in [(sorted_by_count[i],i) for i in range(len(sorted_by_count))]} - return lambda x: label_to_rank.get(x, -1) - - -def mark_moving(df, threshold_static): - - if not df.index.is_monotonic: - df = df.sort_index() - - distance = haversine(df.double_longitude,df.double_latitude,df.double_longitude.shift(-1),df.double_latitude.shift(-1))/ 1000 - time = (df.timestamp.diff(-1) * -1) / (1000*60*60) - - df['stationary_or_not'] = np.where((distance / time) < threshold_static,1,0) # 1 being stationary,0 for moving - - return df - -def number_of_significant_places(locationData): - - uniquelst = locationData[locationData["location_label"] >= 1]["location_label"].unique() - return len(uniquelst) - -def number_location_transitions(locationData): - - # ignores transitions from moving to static and vice-versa, but counts transitions from outliers to major location clusters - df = pd.DataFrame() - - df['boolCol'] = (locationData.location_label == locationData.location_label.shift()) - - return df[df['boolCol'] == False].shape[0] - 1 - -def radius_of_gyration(locationData): - if locationData is None or len(locationData) == 0: - return None - # Center is the centroid, not the home location - valid_clusters = locationData[locationData["location_label"] != -1] - centroid_all_clusters = (valid_clusters.groupby('location_label')[['double_latitude','double_longitude']].mean()).mean() - clusters_centroid = valid_clusters.groupby('location_label')[['double_latitude','double_longitude']].mean() - - rog = 0 - for labels in clusters_centroid.index: - distance = haversine(clusters_centroid.loc[labels].double_longitude,clusters_centroid.loc[labels].double_latitude, - centroid_all_clusters.double_longitude,centroid_all_clusters.double_latitude) ** 2 - - time_in_cluster = locationData[locationData["location_label"]==labels]['timeInSeconds'].sum() - rog = rog + (time_in_cluster * distance) - - time_all_clusters = valid_clusters['timeInSeconds'].sum() - if time_all_clusters == 0: - return 0 - final_rog = (1/time_all_clusters) * rog - - return np.sqrt(final_rog) - -def outlier_time_percent_new(locationData): - if locationData is None or len(locationData)==0: - return None - - clustersDf = locationData[["location_label","timeInSeconds"]] - numoutliers = clustersDf[clustersDf["location_label"]== -1]["timeInSeconds"].sum() - numtotal = clustersDf.timeInSeconds.sum() - - return numoutliers/numtotal - -def location_entropy(locationData): - if locationData is None or len(locationData) == 0: - return None - - clusters = locationData[locationData["location_label"] >= 1] # remove outliers/ cluster noise - if len(clusters) > 0: - # Get percentages for each location - percents = clusters.groupby(['location_label'])['timeInSeconds'].sum() / clusters['timeInSeconds'].sum() - entropy = -1 * percents.map(lambda x: x * np.log(x)).sum() - return entropy - else: - return None - -def location_entropy_normalized(locationData): - if locationData is None or len(locationData) == 0: - return None - - locationData = locationData[locationData["location_label"] >= 1] # remove outliers/ cluster noise - entropy = location_entropy(locationData) - unique_clusters = locationData["location_label"].unique() - num_clusters = len(unique_clusters) - if num_clusters == 0 or len(locationData) == 0 or entropy is None: - return None - elif np.log(num_clusters)==0: - return None - else: - return entropy / np.log(num_clusters) diff --git a/tests/data/external/aware_csv/phone_locations_raw.csv b/tests/data/external/aware_csv/phone_locations_raw.csv new file mode 100644 index 00000000..fcd714c7 --- /dev/null +++ b/tests/data/external/aware_csv/phone_locations_raw.csv @@ -0,0 +1,53 @@ +timestamp,device_id,double_latitude,double_longitude,double_bearing,double_speed,double_altitude,provider,accuracy +1583596560000,android,-100.0,-100.0,0.0,0.0,100,gps,800 +1583596620000,android,-100.0,-100.0,0.0,0.0,100,gps,800 +1583596680000,android,-100.000001,-100.000001,0.0,0.0,100,gps,10 +1583596740000,android,-100.000001,-100.0,0.0,0.0,100,gps,10 +1583596800000,android,-100.000001,-100.0,0.0,0.0,100,gps,800 +1583596860000,android,-100.0,-100.0,0.0,0.0,100,gps,10 +1583596920000,android,-99.999999,-100.0,0.0,0.0,100,gps,800 +1583596980000,android,-99.999999,-99.999999,0.0,0.0,100,gps,10 +1583597040000,android,-99.999999,-99.999999,0.0,0.0,100,gps,10 +1583652600000,android,1.0,1.0000120000000001,0.0,0.0,100,gps,10 +1583652660000,android,1.0,1.0000120000000001,0.0,0.0,100,gps,10 +1583652720000,android,1.000001,1.0,0.0,0.0,100,gps,10 +1583652780000,android,1.0,1.0,0.0,0.0,100,gps,10 +1583652840000,android,1.0,1.0,0.0,0.0,100,gps,10 +1583652900000,android,0.999999,1.0,0.0,0.0,100,gps,10 +1583652960000,android,1.0,0.9999879999999999,0.0,0.0,100,gps,10 +1583653620000,android,1.0,0.9999899999999999,0.0,0.0,100,gps,800 +1583681400000,android,100.0,100.0,0.0,0.0,100,gps,800 +1583681460000,android,100.0,100.0,0.0,0.0,100,gps,800 +1583681520000,android,100.000001,100.000001,0.0,0.0,100,gps,10 +1583681580000,android,100.000001,100.0,0.0,0.0,100,gps,10 +1583681640000,android,100.000001,100.0,0.0,0.0,100,gps,800 +1583681700000,android,100.0,100.0,0.0,0.0,100,gps,10 +1583681760000,android,99.999999,100.0,0.0,0.0,100,gps,800 +1583681820000,android,99.999999,99.999999,0.0,0.0,100,gps,10 +1583681880000,android,99.999999,99.999999,0.0,0.0,100,gps,10 +1604156160000,android,-100.0,-100.0,0.0,0.0,100,gps,800 +1604156220000,android,-100.0,-100.0,0.0,0.0,100,gps,800 +1604156280000,android,-100.000001,-100.000001,0.0,0.0,100,gps,10 +1604156340000,android,-100.000001,-100.0,0.0,0.0,100,gps,10 +1604156400000,android,-100.000001,-100.0,0.0,0.0,100,gps,800 +1604156460000,android,-100.0,-100.0,0.0,0.0,100,gps,10 +1604156520000,android,-99.999999,-100.0,0.0,0.0,100,gps,800 +1604156580000,android,-99.999999,-99.999999,0.0,0.0,100,gps,10 +1604156640000,android,-99.999999,-99.999999,0.0,0.0,100,gps,10 +1604219400000,android,1.0,1.0000120000000001,0.0,0.0,100,gps,10 +1604219460000,android,1.0,1.0000120000000001,0.0,0.0,100,gps,10 +1604219520000,android,1.000001,1.0,0.0,0.0,100,gps,10 +1604219580000,android,1.0,1.0,0.0,0.0,100,gps,10 +1604219640000,android,1.0,1.0,0.0,0.0,100,gps,10 +1604219700000,android,0.999999,1.0,0.0,0.0,100,gps,10 +1604219760000,android,1.0,0.9999879999999999,0.0,0.0,100,gps,10 +1604220420000,android,1.0,0.9999899999999999,0.0,0.0,100,gps,800 +1604248200000,android,100.0,100.0,0.0,0.0,100,gps,800 +1604248260000,android,100.0,100.0,0.0,0.0,100,gps,800 +1604248320000,android,100.000001,100.000001,0.0,0.0,100,gps,10 +1604248380000,android,100.000001,100.0,0.0,0.0,100,gps,10 +1604248440000,android,100.000001,100.0,0.0,0.0,100,gps,800 +1604248500000,android,100.0,100.0,0.0,0.0,100,gps,10 +1604248560000,android,99.999999,100.0,0.0,0.0,100,gps,800 +1604248620000,android,99.999999,99.999999,0.0,0.0,100,gps,10 +1604248680000,android,99.999999,99.999999,0.0,0.0,100,gps,10 diff --git a/tests/data/manual/aware_csv/phone_locations_raw.csv b/tests/data/manual/aware_csv/phone_locations_raw.csv new file mode 100644 index 00000000..396464b1 --- /dev/null +++ b/tests/data/manual/aware_csv/phone_locations_raw.csv @@ -0,0 +1,29 @@ +test_time,device_id,double_latitude,double_longitude,double_bearing,double_speed,double_altitude,provider,accuracy +Sat 10:56:00.000,android,-100,-100,0.0,0.0,100,gps,800 +Sat 10:57:00.000,android,-100,-100,0.0,0.0,100,gps,800 +Sat 10:58:00.000,android,-100.000001,-100.000001,0.0,0.0,100,gps,10 +Sat 10:59:00.000,android,-100.000001,-100,0.0,0.0,100,gps,10 +Sat 11:00:00.000,android,-100.000001,-100,0.0,0.0,100,gps,800 +Sat 11:01:00.000,android,-100,-100,0.0,0.0,100,gps,10 +Sat 11:02:00.000,android,-99.999999,-100,0.0,0.0,100,gps,800 +Sat 11:03:00.000,android,-99.999999,-99.999999,0.0,0.0,100,gps,10 +Sat 11:04:00.000,android,-99.999999,-99.999999,0.0,0.0,100,gps,10 + +Sun 03:30:00.000,android,1,1.000012,0.0,0.0,100,gps,10 +Sun 03:31:00.000,android,1,1.000012,0.0,0.0,100,gps,10 +Sun 03:32:00.000,android,1.000001,1,0.0,0.0,100,gps,10 +Sun 03:33:00.000,android,1,1,0.0,0.0,100,gps,10 +Sun 03:34:00.000,android,1,1,0.0,0.0,100,gps,10 +Sun 03:35:00.000,android,0.999999,1,0.0,0.0,100,gps,10 +Sun 03:36:00.000,android,1,0.999988,0.0,0.0,100,gps,10 +Sun 03:47:00.000,android,1,0.999990,0.0,0.0,100,gps,800 + +Sun 11:30:00.000,android,100,100,0.0,0.0,100,gps,800 +Sun 11:31:00.000,android,100,100,0.0,0.0,100,gps,800 +Sun 11:32:00.000,android,100.000001,100.000001,0.0,0.0,100,gps,10 +Sun 11:33:00.000,android,100.000001,100,0.0,0.0,100,gps,10 +Sun 11:34:00.000,android,100.000001,100,0.0,0.0,100,gps,800 +Sun 11:35:00.000,android,100,100,0.0,0.0,100,gps,10 +Sun 11:36:00.000,android,99.999999,100,0.0,0.0,100,gps,800 +Sun 11:37:00.000,android,99.999999,99.999999,0.0,0.0,100,gps,10 +Sun 11:38:00.000,android,99.999999,99.999999,0.0,0.0,100,gps,10 diff --git a/tests/data/processed/features/mtz_event/android/phone_locations.csv b/tests/data/processed/features/mtz_event/android/phone_locations.csv new file mode 100644 index 00000000..bb605b77 --- /dev/null +++ b/tests/data/processed/features/mtz_event/android/phone_locations.csv @@ -0,0 +1,3 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_loglocationvariance","phone_locations_doryab_totaldistance","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_varspeed","phone_locations_doryab_normalizedlocationentropy","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_locationentropy","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_timeattop2location","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_radiusgyration","phone_locations_doryab_timeattop1location","phone_locations_doryab_timeathome","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_timeattop3location","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_avgspeed","phone_locations_doryab_locationvariance" +"beforeMarchEvent#2020-03-07 16:00:00,2020-03-08 15:00:00","beforeMarchEvent","2020-03-07 16:00:00","2020-03-08 15:00:00",1,3.71826910068082,NA,6,NA,0.346573590279973,0,0.693147180559945,6,6,2,6268829.80206745,6,6,1,0,6,NA,NA,5227.19980200003 +"beforeNovemberEvent#2020-10-31 16:00:00,2020-11-01 13:00:00","beforeNovemberEvent","2020-10-31 16:00:00","2020-11-01 13:00:00",1,3.71826910068082,NA,6,NA,0.346573590279973,0,0.693147180559945,6,6,2,6268829.80206745,6,6,1,0,6,NA,NA,5227.19980200003 diff --git a/tests/data/processed/features/mtz_event/empatica/phone_locations.csv b/tests/data/processed/features/mtz_event/empatica/phone_locations.csv new file mode 100644 index 00000000..f7f04854 --- /dev/null +++ b/tests/data/processed/features/mtz_event/empatica/phone_locations.csv @@ -0,0 +1 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_locationentropy","phone_locations_doryab_locationvariance","phone_locations_doryab_timeattop2location","phone_locations_doryab_varspeed","phone_locations_doryab_timeattop1location","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_timeattop3location","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_radiusgyration","phone_locations_doryab_timeathome","phone_locations_doryab_totaldistance","phone_locations_doryab_loglocationvariance","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_avgspeed","phone_locations_doryab_normalizedlocationentropy" diff --git a/tests/data/processed/features/mtz_event/empty/phone_locations.csv b/tests/data/processed/features/mtz_event/empty/phone_locations.csv new file mode 100644 index 00000000..f7f04854 --- /dev/null +++ b/tests/data/processed/features/mtz_event/empty/phone_locations.csv @@ -0,0 +1 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_locationentropy","phone_locations_doryab_locationvariance","phone_locations_doryab_timeattop2location","phone_locations_doryab_varspeed","phone_locations_doryab_timeattop1location","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_timeattop3location","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_radiusgyration","phone_locations_doryab_timeathome","phone_locations_doryab_totaldistance","phone_locations_doryab_loglocationvariance","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_avgspeed","phone_locations_doryab_normalizedlocationentropy" diff --git a/tests/data/processed/features/mtz_event/fitbit/phone_locations.csv b/tests/data/processed/features/mtz_event/fitbit/phone_locations.csv new file mode 100644 index 00000000..f7f04854 --- /dev/null +++ b/tests/data/processed/features/mtz_event/fitbit/phone_locations.csv @@ -0,0 +1 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_locationentropy","phone_locations_doryab_locationvariance","phone_locations_doryab_timeattop2location","phone_locations_doryab_varspeed","phone_locations_doryab_timeattop1location","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_timeattop3location","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_radiusgyration","phone_locations_doryab_timeathome","phone_locations_doryab_totaldistance","phone_locations_doryab_loglocationvariance","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_avgspeed","phone_locations_doryab_normalizedlocationentropy" diff --git a/tests/data/processed/features/mtz_event/ios/phone_locations.csv b/tests/data/processed/features/mtz_event/ios/phone_locations.csv new file mode 100644 index 00000000..f7f04854 --- /dev/null +++ b/tests/data/processed/features/mtz_event/ios/phone_locations.csv @@ -0,0 +1 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_locationentropy","phone_locations_doryab_locationvariance","phone_locations_doryab_timeattop2location","phone_locations_doryab_varspeed","phone_locations_doryab_timeattop1location","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_timeattop3location","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_radiusgyration","phone_locations_doryab_timeathome","phone_locations_doryab_totaldistance","phone_locations_doryab_loglocationvariance","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_avgspeed","phone_locations_doryab_normalizedlocationentropy" diff --git a/tests/data/processed/features/mtz_frequency/android/phone_locations.csv b/tests/data/processed/features/mtz_frequency/android/phone_locations.csv new file mode 100644 index 00000000..7f5d20cd --- /dev/null +++ b/tests/data/processed/features/mtz_frequency/android/phone_locations.csv @@ -0,0 +1,8 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_locationvariance","phone_locations_doryab_timeattop3location","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_timeattop1location","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_loglocationvariance","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_timeathome","phone_locations_doryab_locationentropy","phone_locations_doryab_radiusgyration","phone_locations_doryab_totaldistance","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_timeattop2location","phone_locations_doryab_normalizedlocationentropy","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_avgspeed","phone_locations_doryab_varspeed","phone_locations_doryab_minlengthstayatclusters" +"thirtyminutes0017#2020-03-08 08:30:00,2020-03-08 08:59:59","thirtyminutes0017","2020-03-08 08:30:00","2020-03-08 08:59:59",1.58333332533827e-12,0,6,0,0,6,-11.8004276472878,1,NA,0,0,NA,NA,6,0,1,0,NA,NA,6 +"thirtyminutes0017#2020-11-01 08:30:00,2020-11-01 08:59:59","thirtyminutes0017","2020-11-01 08:30:00","2020-11-01 08:59:59",1.58333332533827e-12,0,6,0,0,6,-11.8004276472878,1,NA,0,0,NA,NA,6,0,1,0,NA,NA,6 +"thirtyminutes0021#2020-03-07 10:30:00,2020-03-07 10:59:59","thirtyminutes0021","2020-03-07 10:30:00","2020-03-07 10:59:59",4.99999997475243e-13,3,3,0,0,3,-12.301029997857,1,NA,0,0,NA,NA,0,0,1,0,NA,NA,3 +"thirtyminutes0021#2020-10-31 10:30:00,2020-10-31 10:59:59","thirtyminutes0021","2020-10-31 10:30:00","2020-10-31 10:59:59",4.99999997475243e-13,3,3,0,0,3,-12.301029997857,1,NA,0,0,NA,NA,0,0,1,0,NA,NA,3 +"thirtyminutes0022#2020-03-07 11:00:00,2020-03-07 11:29:59","thirtyminutes0022","2020-03-07 11:00:00","2020-03-07 11:29:59",9.99999994950485e-13,3,3,0,0,3,-12.000000002193,1,NA,0,0,NA,NA,0,0,1,0,NA,NA,3 +"thirtyminutes0022#2020-10-31 11:00:00,2020-10-31 11:29:59","thirtyminutes0022","2020-10-31 11:00:00","2020-10-31 11:29:59",9.99999994950485e-13,3,3,0,0,3,-12.000000002193,1,NA,0,0,NA,NA,0,0,1,0,NA,NA,3 +"thirtyminutes0047#2020-03-07 23:30:00,2020-03-07 23:59:59","thirtyminutes0047","2020-03-07 23:30:00","2020-03-07 23:59:59",3.8799999999484e-11,0,6,0,6,6,-10.4111682744116,1,6,0,0,NA,NA,0,0,1,0,NA,NA,6 diff --git a/tests/data/processed/features/mtz_frequency/empatica/phone_locations.csv b/tests/data/processed/features/mtz_frequency/empatica/phone_locations.csv new file mode 100644 index 00000000..f7f04854 --- /dev/null +++ b/tests/data/processed/features/mtz_frequency/empatica/phone_locations.csv @@ -0,0 +1 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_locationentropy","phone_locations_doryab_locationvariance","phone_locations_doryab_timeattop2location","phone_locations_doryab_varspeed","phone_locations_doryab_timeattop1location","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_timeattop3location","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_radiusgyration","phone_locations_doryab_timeathome","phone_locations_doryab_totaldistance","phone_locations_doryab_loglocationvariance","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_avgspeed","phone_locations_doryab_normalizedlocationentropy" diff --git a/tests/data/processed/features/mtz_frequency/empty/phone_locations.csv b/tests/data/processed/features/mtz_frequency/empty/phone_locations.csv new file mode 100644 index 00000000..f7f04854 --- /dev/null +++ b/tests/data/processed/features/mtz_frequency/empty/phone_locations.csv @@ -0,0 +1 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_locationentropy","phone_locations_doryab_locationvariance","phone_locations_doryab_timeattop2location","phone_locations_doryab_varspeed","phone_locations_doryab_timeattop1location","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_timeattop3location","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_radiusgyration","phone_locations_doryab_timeathome","phone_locations_doryab_totaldistance","phone_locations_doryab_loglocationvariance","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_avgspeed","phone_locations_doryab_normalizedlocationentropy" diff --git a/tests/data/processed/features/mtz_frequency/fitbit/phone_locations.csv b/tests/data/processed/features/mtz_frequency/fitbit/phone_locations.csv new file mode 100644 index 00000000..f7f04854 --- /dev/null +++ b/tests/data/processed/features/mtz_frequency/fitbit/phone_locations.csv @@ -0,0 +1 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_locationentropy","phone_locations_doryab_locationvariance","phone_locations_doryab_timeattop2location","phone_locations_doryab_varspeed","phone_locations_doryab_timeattop1location","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_timeattop3location","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_radiusgyration","phone_locations_doryab_timeathome","phone_locations_doryab_totaldistance","phone_locations_doryab_loglocationvariance","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_avgspeed","phone_locations_doryab_normalizedlocationentropy" diff --git a/tests/data/processed/features/mtz_frequency/ios/phone_locations.csv b/tests/data/processed/features/mtz_frequency/ios/phone_locations.csv new file mode 100644 index 00000000..f7f04854 --- /dev/null +++ b/tests/data/processed/features/mtz_frequency/ios/phone_locations.csv @@ -0,0 +1 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_locationentropy","phone_locations_doryab_locationvariance","phone_locations_doryab_timeattop2location","phone_locations_doryab_varspeed","phone_locations_doryab_timeattop1location","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_timeattop3location","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_radiusgyration","phone_locations_doryab_timeathome","phone_locations_doryab_totaldistance","phone_locations_doryab_loglocationvariance","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_avgspeed","phone_locations_doryab_normalizedlocationentropy" diff --git a/tests/data/processed/features/mtz_periodic/android/phone_locations.csv b/tests/data/processed/features/mtz_periodic/android/phone_locations.csv new file mode 100644 index 00000000..b9bc500c --- /dev/null +++ b/tests/data/processed/features/mtz_periodic/android/phone_locations.csv @@ -0,0 +1,16 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_locationentropy","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_timeattop1location","phone_locations_doryab_totaldistance","phone_locations_doryab_timeattop3location","phone_locations_doryab_avgspeed","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_radiusgyration","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_timeattop2location","phone_locations_doryab_varspeed","phone_locations_doryab_loglocationvariance","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_timeathome","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_normalizedlocationentropy","phone_locations_doryab_locationvariance" +"daily#2020-03-07 00:00:00,2020-03-07 23:59:59","daily","2020-03-07 00:00:00","2020-03-07 23:59:59",6,0.693147180559945,NA,6,NA,6,NA,1,6416036.08057409,1,0,NA,3.73564149377632,0,6,2,6,6,0.346573590279973,5440.53356226669 +"daily#2020-03-08 00:00:00,2020-03-08 23:59:59","daily","2020-03-08 00:00:00","2020-03-08 23:59:59",6,0,NA,0,NA,0,NA,0,0,1,6,NA,-11.8004276472878,0,NA,1,6,6,0,1.58333332533827e-12 +"daily#2020-10-31 00:00:00,2020-10-31 23:59:59","daily","2020-10-31 00:00:00","2020-10-31 23:59:59",6,0,NA,0,NA,6,NA,0,0,1,0,NA,-11.8004276472878,0,NA,1,6,6,0,1.58333332533827e-12 +"daily#2020-11-01 00:00:00,2020-11-01 23:59:59","daily","2020-11-01 00:00:00","2020-11-01 23:59:59",6,0.693147180559945,NA,6,NA,0,NA,1,6268829.80206745,1,6,NA,3.71826910068082,0,6,2,6,6,0.346573590279973,5227.19980200003 +"morning#2020-03-07 06:00:00,2020-03-07 11:59:59","morning","2020-03-07 06:00:00","2020-03-07 11:59:59",6,0,NA,0,NA,6,NA,0,0,1,0,NA,-11.8004276472878,0,NA,1,6,6,0,1.58333332533827e-12 +"morning#2020-03-08 06:00:00,2020-03-08 11:59:59","morning","2020-03-08 06:00:00","2020-03-08 11:59:59",6,0,NA,0,NA,0,NA,0,0,1,6,NA,-11.8004276472878,0,NA,1,6,6,0,1.58333332533827e-12 +"morning#2020-10-31 06:00:00,2020-10-31 11:59:59","morning","2020-10-31 06:00:00","2020-10-31 11:59:59",6,0,NA,0,NA,6,NA,0,0,1,0,NA,-11.8004276472878,0,NA,1,6,6,0,1.58333332533827e-12 +"morning#2020-11-01 06:00:00,2020-11-01 11:59:59","morning","2020-11-01 06:00:00","2020-11-01 11:59:59",6,0,NA,0,NA,0,NA,0,0,1,6,NA,-11.8004276472878,0,NA,1,6,6,0,1.58333332533827e-12 +"threeday#2020-03-07 00:00:00,2020-03-09 23:59:59","threeday","2020-03-07 00:00:00","2020-03-09 23:59:59",6,1.09861228866811,NA,6,NA,6,NA,2,8014514.68387131,1,6,NA,4.09019524373105,0,6,3,6,6,0.366204096222703,12308.2198130989 +"threeday#2020-03-08 00:00:00,2020-03-10 23:59:59","threeday","2020-03-08 00:00:00","2020-03-10 23:59:59",6,0,NA,0,NA,0,NA,0,0,1,6,NA,-11.8004276472878,0,NA,1,6,6,0,1.58333332533827e-12 +"threeday#2020-10-29 00:00:00,2020-10-31 23:59:59","threeday","2020-10-29 00:00:00","2020-10-31 23:59:59",6,0,NA,0,NA,6,NA,0,0,1,0,NA,-11.8004276472878,0,NA,1,6,6,0,1.58333332533827e-12 +"threeday#2020-10-30 00:00:00,2020-11-01 23:59:59","threeday","2020-10-30 00:00:00","2020-11-01 23:59:59",6,1.09861228866811,NA,6,NA,6,NA,2,8014514.68387131,1,6,NA,4.09019524373105,0,6,3,6,6,0.366204096222703,12308.2198130989 +"threeday#2020-10-31 00:00:00,2020-11-02 23:59:59","threeday","2020-10-31 00:00:00","2020-11-02 23:59:59",6,1.09861228866811,NA,6,NA,6,NA,2,8014514.68387131,1,6,NA,4.09019524373105,0,6,3,6,6,0.366204096222703,12308.2198130989 +"threeday#2020-11-01 00:00:00,2020-11-03 23:59:59","threeday","2020-11-01 00:00:00","2020-11-03 23:59:59",6,0.693147180559945,NA,6,NA,0,NA,1,6268829.80206745,1,6,NA,3.71826910068082,0,6,2,6,6,0.346573590279973,5227.19980200003 +"weekend#2020-10-30 00:00:00,2020-11-01 23:59:59","weekend","2020-10-30 00:00:00","2020-11-01 23:59:59",6,1.09861228866811,NA,6,NA,6,NA,2,8014514.68387131,1,6,NA,4.09019524373105,0,6,3,6,6,0.366204096222703,12308.2198130989 diff --git a/tests/data/processed/features/mtz_periodic/empatica/phone_locations.csv b/tests/data/processed/features/mtz_periodic/empatica/phone_locations.csv new file mode 100644 index 00000000..f7f04854 --- /dev/null +++ b/tests/data/processed/features/mtz_periodic/empatica/phone_locations.csv @@ -0,0 +1 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_locationentropy","phone_locations_doryab_locationvariance","phone_locations_doryab_timeattop2location","phone_locations_doryab_varspeed","phone_locations_doryab_timeattop1location","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_timeattop3location","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_radiusgyration","phone_locations_doryab_timeathome","phone_locations_doryab_totaldistance","phone_locations_doryab_loglocationvariance","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_avgspeed","phone_locations_doryab_normalizedlocationentropy" diff --git a/tests/data/processed/features/mtz_periodic/empty/phone_locations.csv b/tests/data/processed/features/mtz_periodic/empty/phone_locations.csv new file mode 100644 index 00000000..f7f04854 --- /dev/null +++ b/tests/data/processed/features/mtz_periodic/empty/phone_locations.csv @@ -0,0 +1 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_locationentropy","phone_locations_doryab_locationvariance","phone_locations_doryab_timeattop2location","phone_locations_doryab_varspeed","phone_locations_doryab_timeattop1location","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_timeattop3location","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_radiusgyration","phone_locations_doryab_timeathome","phone_locations_doryab_totaldistance","phone_locations_doryab_loglocationvariance","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_avgspeed","phone_locations_doryab_normalizedlocationentropy" diff --git a/tests/data/processed/features/mtz_periodic/fitbit/phone_locations.csv b/tests/data/processed/features/mtz_periodic/fitbit/phone_locations.csv new file mode 100644 index 00000000..f7f04854 --- /dev/null +++ b/tests/data/processed/features/mtz_periodic/fitbit/phone_locations.csv @@ -0,0 +1 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_locationentropy","phone_locations_doryab_locationvariance","phone_locations_doryab_timeattop2location","phone_locations_doryab_varspeed","phone_locations_doryab_timeattop1location","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_timeattop3location","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_radiusgyration","phone_locations_doryab_timeathome","phone_locations_doryab_totaldistance","phone_locations_doryab_loglocationvariance","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_avgspeed","phone_locations_doryab_normalizedlocationentropy" diff --git a/tests/data/processed/features/mtz_periodic/ios/phone_locations.csv b/tests/data/processed/features/mtz_periodic/ios/phone_locations.csv new file mode 100644 index 00000000..f7f04854 --- /dev/null +++ b/tests/data/processed/features/mtz_periodic/ios/phone_locations.csv @@ -0,0 +1 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_locationentropy","phone_locations_doryab_locationvariance","phone_locations_doryab_timeattop2location","phone_locations_doryab_varspeed","phone_locations_doryab_timeattop1location","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_timeattop3location","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_radiusgyration","phone_locations_doryab_timeathome","phone_locations_doryab_totaldistance","phone_locations_doryab_loglocationvariance","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_avgspeed","phone_locations_doryab_normalizedlocationentropy" diff --git a/tests/data/processed/features/stz_event/android/phone_locations.csv b/tests/data/processed/features/stz_event/android/phone_locations.csv new file mode 100644 index 00000000..ab7ad896 --- /dev/null +++ b/tests/data/processed/features/stz_event/android/phone_locations.csv @@ -0,0 +1,3 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_loglocationvariance","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_avgspeed","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_locationvariance","phone_locations_doryab_varspeed","phone_locations_doryab_timeattop2location","phone_locations_doryab_normalizedlocationentropy","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_timeattop3location","phone_locations_doryab_locationentropy","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_radiusgyration","phone_locations_doryab_timeattop1location","phone_locations_doryab_totaldistance","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_timeathome" +"beforeMarchEvent#2020-03-07 16:00:00,2020-03-08 15:00:00","beforeMarchEvent","2020-03-07 16:00:00","2020-03-08 15:00:00",3.71826910068082,2,1,6,NA,NA,6,5227.19980200003,NA,6,0.346573590279973,1,0,0.693147180559945,6,6268829.80206745,6,NA,0,6 +"beforeNovemberEvent#2020-10-31 16:00:00,2020-11-01 13:00:00","beforeNovemberEvent","2020-10-31 16:00:00","2020-11-01 13:00:00",3.71826910068082,2,1,6,NA,NA,6,5227.19980200003,NA,6,0.346573590279973,1,0,0.693147180559945,6,6268829.80206745,6,NA,0,6 diff --git a/tests/data/processed/features/stz_event/empatica/phone_locations.csv b/tests/data/processed/features/stz_event/empatica/phone_locations.csv new file mode 100644 index 00000000..f7f04854 --- /dev/null +++ b/tests/data/processed/features/stz_event/empatica/phone_locations.csv @@ -0,0 +1 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_locationentropy","phone_locations_doryab_locationvariance","phone_locations_doryab_timeattop2location","phone_locations_doryab_varspeed","phone_locations_doryab_timeattop1location","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_timeattop3location","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_radiusgyration","phone_locations_doryab_timeathome","phone_locations_doryab_totaldistance","phone_locations_doryab_loglocationvariance","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_avgspeed","phone_locations_doryab_normalizedlocationentropy" diff --git a/tests/data/processed/features/stz_event/empty/phone_locations.csv b/tests/data/processed/features/stz_event/empty/phone_locations.csv new file mode 100644 index 00000000..f7f04854 --- /dev/null +++ b/tests/data/processed/features/stz_event/empty/phone_locations.csv @@ -0,0 +1 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_locationentropy","phone_locations_doryab_locationvariance","phone_locations_doryab_timeattop2location","phone_locations_doryab_varspeed","phone_locations_doryab_timeattop1location","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_timeattop3location","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_radiusgyration","phone_locations_doryab_timeathome","phone_locations_doryab_totaldistance","phone_locations_doryab_loglocationvariance","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_avgspeed","phone_locations_doryab_normalizedlocationentropy" diff --git a/tests/data/processed/features/stz_event/fitbit/phone_locations.csv b/tests/data/processed/features/stz_event/fitbit/phone_locations.csv new file mode 100644 index 00000000..f7f04854 --- /dev/null +++ b/tests/data/processed/features/stz_event/fitbit/phone_locations.csv @@ -0,0 +1 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_locationentropy","phone_locations_doryab_locationvariance","phone_locations_doryab_timeattop2location","phone_locations_doryab_varspeed","phone_locations_doryab_timeattop1location","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_timeattop3location","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_radiusgyration","phone_locations_doryab_timeathome","phone_locations_doryab_totaldistance","phone_locations_doryab_loglocationvariance","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_avgspeed","phone_locations_doryab_normalizedlocationentropy" diff --git a/tests/data/processed/features/stz_event/ios/phone_locations.csv b/tests/data/processed/features/stz_event/ios/phone_locations.csv new file mode 100644 index 00000000..f7f04854 --- /dev/null +++ b/tests/data/processed/features/stz_event/ios/phone_locations.csv @@ -0,0 +1 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_locationentropy","phone_locations_doryab_locationvariance","phone_locations_doryab_timeattop2location","phone_locations_doryab_varspeed","phone_locations_doryab_timeattop1location","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_timeattop3location","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_radiusgyration","phone_locations_doryab_timeathome","phone_locations_doryab_totaldistance","phone_locations_doryab_loglocationvariance","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_avgspeed","phone_locations_doryab_normalizedlocationentropy" diff --git a/tests/data/processed/features/stz_frequency/android/phone_locations.csv b/tests/data/processed/features/stz_frequency/android/phone_locations.csv new file mode 100644 index 00000000..f50bc3ea --- /dev/null +++ b/tests/data/processed/features/stz_frequency/android/phone_locations.csv @@ -0,0 +1,9 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_timeattop1location","phone_locations_doryab_varspeed","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_locationvariance","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_timeathome","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_loglocationvariance","phone_locations_doryab_totaldistance","phone_locations_doryab_radiusgyration","phone_locations_doryab_normalizedlocationentropy","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_locationentropy","phone_locations_doryab_timeattop2location","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_avgspeed","phone_locations_doryab_timeattop3location" +"thirtyminutes0007#2020-03-08 03:30:00,2020-03-08 03:59:59","thirtyminutes0007","2020-03-08 03:30:00","2020-03-08 03:59:59",6,NA,6,0,3.8799999999484e-11,NA,6,6,1,1,-10.4111682744116,NA,0,0,0,0,0,6,NA,0 +"thirtyminutes0007#2020-11-01 03:30:00,2020-11-01 03:59:59","thirtyminutes0007","2020-11-01 03:30:00","2020-11-01 03:59:59",6,NA,6,0,3.8799999999484e-11,NA,6,6,1,1,-10.4111682744116,NA,0,0,0,0,0,6,NA,0 +"thirtyminutes0021#2020-03-07 10:30:00,2020-03-07 10:59:59","thirtyminutes0021","2020-03-07 10:30:00","2020-03-07 10:59:59",0,NA,3,0,4.99999997475243e-13,NA,3,NA,1,1,-12.301029997857,NA,0,0,0,0,0,3,NA,3 +"thirtyminutes0021#2020-10-31 10:30:00,2020-10-31 10:59:59","thirtyminutes0021","2020-10-31 10:30:00","2020-10-31 10:59:59",0,NA,3,0,4.99999997475243e-13,NA,3,NA,1,1,-12.301029997857,NA,0,0,0,0,0,3,NA,3 +"thirtyminutes0022#2020-03-07 11:00:00,2020-03-07 11:29:59","thirtyminutes0022","2020-03-07 11:00:00","2020-03-07 11:29:59",0,NA,3,0,9.99999994950485e-13,NA,3,NA,1,1,-12.000000002193,NA,0,0,0,0,0,3,NA,3 +"thirtyminutes0022#2020-10-31 11:00:00,2020-10-31 11:29:59","thirtyminutes0022","2020-10-31 11:00:00","2020-10-31 11:29:59",0,NA,3,0,9.99999994950485e-13,NA,3,NA,1,1,-12.000000002193,NA,0,0,0,0,0,3,NA,3 +"thirtyminutes0023#2020-03-08 11:30:00,2020-03-08 11:59:59","thirtyminutes0023","2020-03-08 11:30:00","2020-03-08 11:59:59",0,NA,6,0,1.58333332533827e-12,NA,6,NA,1,1,-11.8004276472878,NA,0,0,0,0,6,6,NA,0 +"thirtyminutes0023#2020-11-01 11:30:00,2020-11-01 11:59:59","thirtyminutes0023","2020-11-01 11:30:00","2020-11-01 11:59:59",0,NA,6,0,1.58333332533827e-12,NA,6,NA,1,1,-11.8004276472878,NA,0,0,0,0,6,6,NA,0 diff --git a/tests/data/processed/features/stz_frequency/empatica/phone_locations.csv b/tests/data/processed/features/stz_frequency/empatica/phone_locations.csv new file mode 100644 index 00000000..f7f04854 --- /dev/null +++ b/tests/data/processed/features/stz_frequency/empatica/phone_locations.csv @@ -0,0 +1 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_locationentropy","phone_locations_doryab_locationvariance","phone_locations_doryab_timeattop2location","phone_locations_doryab_varspeed","phone_locations_doryab_timeattop1location","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_timeattop3location","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_radiusgyration","phone_locations_doryab_timeathome","phone_locations_doryab_totaldistance","phone_locations_doryab_loglocationvariance","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_avgspeed","phone_locations_doryab_normalizedlocationentropy" diff --git a/tests/data/processed/features/stz_frequency/empty/phone_locations.csv b/tests/data/processed/features/stz_frequency/empty/phone_locations.csv new file mode 100644 index 00000000..f7f04854 --- /dev/null +++ b/tests/data/processed/features/stz_frequency/empty/phone_locations.csv @@ -0,0 +1 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_locationentropy","phone_locations_doryab_locationvariance","phone_locations_doryab_timeattop2location","phone_locations_doryab_varspeed","phone_locations_doryab_timeattop1location","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_timeattop3location","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_radiusgyration","phone_locations_doryab_timeathome","phone_locations_doryab_totaldistance","phone_locations_doryab_loglocationvariance","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_avgspeed","phone_locations_doryab_normalizedlocationentropy" diff --git a/tests/data/processed/features/stz_frequency/fitbit/phone_locations.csv b/tests/data/processed/features/stz_frequency/fitbit/phone_locations.csv new file mode 100644 index 00000000..f7f04854 --- /dev/null +++ b/tests/data/processed/features/stz_frequency/fitbit/phone_locations.csv @@ -0,0 +1 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_locationentropy","phone_locations_doryab_locationvariance","phone_locations_doryab_timeattop2location","phone_locations_doryab_varspeed","phone_locations_doryab_timeattop1location","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_timeattop3location","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_radiusgyration","phone_locations_doryab_timeathome","phone_locations_doryab_totaldistance","phone_locations_doryab_loglocationvariance","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_avgspeed","phone_locations_doryab_normalizedlocationentropy" diff --git a/tests/data/processed/features/stz_frequency/ios/phone_locations.csv b/tests/data/processed/features/stz_frequency/ios/phone_locations.csv new file mode 100644 index 00000000..f7f04854 --- /dev/null +++ b/tests/data/processed/features/stz_frequency/ios/phone_locations.csv @@ -0,0 +1 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_locationentropy","phone_locations_doryab_locationvariance","phone_locations_doryab_timeattop2location","phone_locations_doryab_varspeed","phone_locations_doryab_timeattop1location","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_timeattop3location","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_radiusgyration","phone_locations_doryab_timeathome","phone_locations_doryab_totaldistance","phone_locations_doryab_loglocationvariance","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_avgspeed","phone_locations_doryab_normalizedlocationentropy" diff --git a/tests/data/processed/features/stz_periodic/android/phone_locations.csv b/tests/data/processed/features/stz_periodic/android/phone_locations.csv new file mode 100644 index 00000000..01fa3189 --- /dev/null +++ b/tests/data/processed/features/stz_periodic/android/phone_locations.csv @@ -0,0 +1,16 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_timeathome","phone_locations_doryab_radiusgyration","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_locationvariance","phone_locations_doryab_normalizedlocationentropy","phone_locations_doryab_timeattop2location","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_totaldistance","phone_locations_doryab_varspeed","phone_locations_doryab_timeattop3location","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_locationentropy","phone_locations_doryab_loglocationvariance","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_timeattop1location","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_avgspeed" +"daily#2020-03-07 00:00:00,2020-03-07 23:59:59","daily","2020-03-07 00:00:00","2020-03-07 23:59:59",0,NA,0,NA,1.58333332533827e-12,0,0,0,NA,NA,6,6,1,1,0,-11.8004276472878,6,0,6,NA +"daily#2020-03-08 00:00:00,2020-03-08 23:59:59","daily","2020-03-08 00:00:00","2020-03-08 23:59:59",0,6,6268829.80206745,NA,5227.19980200003,0.346573590279973,6,1,NA,NA,0,6,1,2,0.693147180559945,3.71826910068082,6,6,6,NA +"daily#2020-10-31 00:00:00,2020-10-31 23:59:59","daily","2020-10-31 00:00:00","2020-10-31 23:59:59",0,NA,0,NA,1.58333332533827e-12,0,0,0,NA,NA,6,6,1,1,0,-11.8004276472878,6,0,6,NA +"daily#2020-11-01 00:00:00,2020-11-01 23:59:59","daily","2020-11-01 00:00:00","2020-11-01 23:59:59",0,6,6268829.80206745,NA,5227.19980200003,0.346573590279973,6,1,NA,NA,0,6,1,2,0.693147180559945,3.71826910068082,6,6,6,NA +"morning#2020-03-07 06:00:00,2020-03-07 11:59:59","morning","2020-03-07 06:00:00","2020-03-07 11:59:59",0,NA,0,NA,1.58333332533827e-12,0,0,0,NA,NA,6,6,1,1,0,-11.8004276472878,6,0,6,NA +"morning#2020-03-08 06:00:00,2020-03-08 11:59:59","morning","2020-03-08 06:00:00","2020-03-08 11:59:59",0,NA,0,NA,1.58333332533827e-12,0,6,0,NA,NA,0,6,1,1,0,-11.8004276472878,6,0,6,NA +"morning#2020-10-31 06:00:00,2020-10-31 11:59:59","morning","2020-10-31 06:00:00","2020-10-31 11:59:59",0,NA,0,NA,1.58333332533827e-12,0,0,0,NA,NA,6,6,1,1,0,-11.8004276472878,6,0,6,NA +"morning#2020-11-01 06:00:00,2020-11-01 11:59:59","morning","2020-11-01 06:00:00","2020-11-01 11:59:59",0,NA,0,NA,1.58333332533827e-12,0,6,0,NA,NA,0,6,1,1,0,-11.8004276472878,6,0,6,NA +"threeday#2020-03-07 00:00:00,2020-03-09 23:59:59","threeday","2020-03-07 00:00:00","2020-03-09 23:59:59",0,6,8014514.68387131,NA,12308.2198130989,0.366204096222703,6,2,NA,NA,6,6,1,3,1.09861228866811,4.09019524373105,6,6,6,NA +"threeday#2020-03-08 00:00:00,2020-03-10 23:59:59","threeday","2020-03-08 00:00:00","2020-03-10 23:59:59",0,6,6268829.80206745,NA,5227.19980200003,0.346573590279973,6,1,NA,NA,0,6,1,2,0.693147180559945,3.71826910068082,6,6,6,NA +"threeday#2020-10-29 00:00:00,2020-10-31 23:59:59","threeday","2020-10-29 00:00:00","2020-10-31 23:59:59",0,NA,0,NA,1.58333332533827e-12,0,0,0,NA,NA,6,6,1,1,0,-11.8004276472878,6,0,6,NA +"threeday#2020-10-30 00:00:00,2020-11-01 23:59:59","threeday","2020-10-30 00:00:00","2020-11-01 23:59:59",0,6,8014514.68387131,NA,12308.2198130989,0.366204096222703,6,2,NA,NA,6,6,1,3,1.09861228866811,4.09019524373105,6,6,6,NA +"threeday#2020-10-31 00:00:00,2020-11-02 23:59:59","threeday","2020-10-31 00:00:00","2020-11-02 23:59:59",0,6,8014514.68387131,NA,12308.2198130989,0.366204096222703,6,2,NA,NA,6,6,1,3,1.09861228866811,4.09019524373105,6,6,6,NA +"threeday#2020-11-01 00:00:00,2020-11-03 23:59:59","threeday","2020-11-01 00:00:00","2020-11-03 23:59:59",0,6,6268829.80206745,NA,5227.19980200003,0.346573590279973,6,1,NA,NA,0,6,1,2,0.693147180559945,3.71826910068082,6,6,6,NA +"weekend#2020-10-30 00:00:00,2020-11-01 23:59:59","weekend","2020-10-30 00:00:00","2020-11-01 23:59:59",0,6,8014514.68387131,NA,12308.2198130989,0.366204096222703,6,2,NA,NA,6,6,1,3,1.09861228866811,4.09019524373105,6,6,6,NA diff --git a/tests/data/processed/features/stz_periodic/empatica/phone_locations.csv b/tests/data/processed/features/stz_periodic/empatica/phone_locations.csv new file mode 100644 index 00000000..f7f04854 --- /dev/null +++ b/tests/data/processed/features/stz_periodic/empatica/phone_locations.csv @@ -0,0 +1 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_locationentropy","phone_locations_doryab_locationvariance","phone_locations_doryab_timeattop2location","phone_locations_doryab_varspeed","phone_locations_doryab_timeattop1location","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_timeattop3location","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_radiusgyration","phone_locations_doryab_timeathome","phone_locations_doryab_totaldistance","phone_locations_doryab_loglocationvariance","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_avgspeed","phone_locations_doryab_normalizedlocationentropy" diff --git a/tests/data/processed/features/stz_periodic/empty/phone_locations.csv b/tests/data/processed/features/stz_periodic/empty/phone_locations.csv new file mode 100644 index 00000000..f7f04854 --- /dev/null +++ b/tests/data/processed/features/stz_periodic/empty/phone_locations.csv @@ -0,0 +1 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_locationentropy","phone_locations_doryab_locationvariance","phone_locations_doryab_timeattop2location","phone_locations_doryab_varspeed","phone_locations_doryab_timeattop1location","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_timeattop3location","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_radiusgyration","phone_locations_doryab_timeathome","phone_locations_doryab_totaldistance","phone_locations_doryab_loglocationvariance","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_avgspeed","phone_locations_doryab_normalizedlocationentropy" diff --git a/tests/data/processed/features/stz_periodic/fitbit/phone_locations.csv b/tests/data/processed/features/stz_periodic/fitbit/phone_locations.csv new file mode 100644 index 00000000..f7f04854 --- /dev/null +++ b/tests/data/processed/features/stz_periodic/fitbit/phone_locations.csv @@ -0,0 +1 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_locationentropy","phone_locations_doryab_locationvariance","phone_locations_doryab_timeattop2location","phone_locations_doryab_varspeed","phone_locations_doryab_timeattop1location","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_timeattop3location","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_radiusgyration","phone_locations_doryab_timeathome","phone_locations_doryab_totaldistance","phone_locations_doryab_loglocationvariance","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_avgspeed","phone_locations_doryab_normalizedlocationentropy" diff --git a/tests/data/processed/features/stz_periodic/ios/phone_locations.csv b/tests/data/processed/features/stz_periodic/ios/phone_locations.csv new file mode 100644 index 00000000..f7f04854 --- /dev/null +++ b/tests/data/processed/features/stz_periodic/ios/phone_locations.csv @@ -0,0 +1 @@ +"local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime","phone_locations_doryab_avglengthstayatclusters","phone_locations_doryab_outlierstimepercent","phone_locations_doryab_maxlengthstayatclusters","phone_locations_doryab_locationentropy","phone_locations_doryab_locationvariance","phone_locations_doryab_timeattop2location","phone_locations_doryab_varspeed","phone_locations_doryab_timeattop1location","phone_locations_doryab_numberlocationtransitions","phone_locations_doryab_minlengthstayatclusters","phone_locations_doryab_movingtostaticratio","phone_locations_doryab_timeattop3location","phone_locations_doryab_numberofsignificantplaces","phone_locations_doryab_radiusgyration","phone_locations_doryab_timeathome","phone_locations_doryab_totaldistance","phone_locations_doryab_loglocationvariance","phone_locations_doryab_stdlengthstayatclusters","phone_locations_doryab_avgspeed","phone_locations_doryab_normalizedlocationentropy" diff --git a/tests/settings/mtz_event_config.yaml b/tests/settings/mtz_event_config.yaml index 9a286ce9..ad8895b5 100644 --- a/tests/settings/mtz_event_config.yaml +++ b/tests/settings/mtz_event_config.yaml @@ -203,7 +203,7 @@ PHONE_CONVERSATION: # See https://www.rapids.science/latest/features/phone-data-yield/ PHONE_DATA_YIELD: - SENSORS: [] + SENSORS: [PHONE_LOCATIONS] PROVIDERS: RAPIDS: COMPUTE: False @@ -231,30 +231,26 @@ PHONE_LIGHT: # See https://www.rapids.science/latest/features/phone-locations/ PHONE_LOCATIONS: - CONTAINER: locations - LOCATIONS_TO_USE: ALL_RESAMPLED # ALL, GPS, ALL_RESAMPLED, OR FUSED_RESAMPLED + CONTAINER: phone_locations_raw.csv + LOCATIONS_TO_USE: GPS # ALL, GPS, ALL_RESAMPLED, OR FUSED_RESAMPLED FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row - HOME_INFERENCE: - DBSCAN_EPS: 10 # meters - DBSCAN_MINSAMPLES: 5 - THRESHOLD_STATIC : 1 # km/h - CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS PROVIDERS: DORYAB: - COMPUTE: False - FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome"] + COMPUTE: True + FEATURES: ["locationvariance","loglocationvariance","totaldistance","avgspeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","avglengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome"] ACCURACY_LIMIT: 100 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius DBSCAN_EPS: 10 # meters - DBSCAN_MINSAMPLES: 5 + DBSCAN_MINSAMPLES: 3 THRESHOLD_STATIC : 1 # km/h - MAXIMUM_ROW_GAP: 300 - MAXIMUM_ROW_DURATION: 60 + MAXIMUM_ROW_GAP: 300 # seconds MINUTES_DATA_USED: False - CLUSTER_ON: PARTICIPANT_DATASET # PARTICIPANT_DATASET,TIME_SEGMENT - CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS - RADIUS_FOR_HOME: 100 + CLUSTER_ON: PARTICIPANT_DATASET # PARTICIPANT_DATASET, TIME_SEGMENT, TIME_SEGMENT_INSTANCE + INFER_HOME_LOCATION_STRATEGY: DORYAB_STRATEGY # DORYAB_STRATEGY, SUN_LI_VEGA_STRATEGY + MINIMUM_DAYS_TO_DETECT_HOME_CHANGES: 3 + CLUSTERING_ALGORITHM: DBSCAN # DBSCAN, OPTICS + RADIUS_FOR_HOME: 100 SRC_SCRIPT: src/features/phone_locations/doryab/main.py BARNETT: diff --git a/tests/settings/mtz_frequency_config.yaml b/tests/settings/mtz_frequency_config.yaml index 0c6482cc..ebd73769 100644 --- a/tests/settings/mtz_frequency_config.yaml +++ b/tests/settings/mtz_frequency_config.yaml @@ -203,7 +203,7 @@ PHONE_CONVERSATION: # See https://www.rapids.science/latest/features/phone-data-yield/ PHONE_DATA_YIELD: - SENSORS: [] + SENSORS: [PHONE_LOCATIONS] PROVIDERS: RAPIDS: COMPUTE: False @@ -231,30 +231,26 @@ PHONE_LIGHT: # See https://www.rapids.science/latest/features/phone-locations/ PHONE_LOCATIONS: - CONTAINER: locations - LOCATIONS_TO_USE: ALL_RESAMPLED # ALL, GPS, ALL_RESAMPLED, OR FUSED_RESAMPLED + CONTAINER: phone_locations_raw.csv + LOCATIONS_TO_USE: GPS # ALL, GPS, ALL_RESAMPLED, OR FUSED_RESAMPLED FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row - HOME_INFERENCE: - DBSCAN_EPS: 10 # meters - DBSCAN_MINSAMPLES: 5 - THRESHOLD_STATIC : 1 # km/h - CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS PROVIDERS: DORYAB: - COMPUTE: False - FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome"] + COMPUTE: True + FEATURES: ["locationvariance","loglocationvariance","totaldistance","avgspeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","avglengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome"] ACCURACY_LIMIT: 100 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius DBSCAN_EPS: 10 # meters - DBSCAN_MINSAMPLES: 5 + DBSCAN_MINSAMPLES: 3 THRESHOLD_STATIC : 1 # km/h - MAXIMUM_ROW_GAP: 300 - MAXIMUM_ROW_DURATION: 60 + MAXIMUM_ROW_GAP: 300 # seconds MINUTES_DATA_USED: False - CLUSTER_ON: PARTICIPANT_DATASET # PARTICIPANT_DATASET,TIME_SEGMENT - CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS - RADIUS_FOR_HOME: 100 + CLUSTER_ON: PARTICIPANT_DATASET # PARTICIPANT_DATASET, TIME_SEGMENT, TIME_SEGMENT_INSTANCE + INFER_HOME_LOCATION_STRATEGY: DORYAB_STRATEGY # DORYAB_STRATEGY, SUN_LI_VEGA_STRATEGY + MINIMUM_DAYS_TO_DETECT_HOME_CHANGES: 3 + CLUSTERING_ALGORITHM: DBSCAN # DBSCAN, OPTICS + RADIUS_FOR_HOME: 100 SRC_SCRIPT: src/features/phone_locations/doryab/main.py BARNETT: diff --git a/tests/settings/mtz_periodic_config.yaml b/tests/settings/mtz_periodic_config.yaml index a492ce7d..56b3ca09 100644 --- a/tests/settings/mtz_periodic_config.yaml +++ b/tests/settings/mtz_periodic_config.yaml @@ -203,7 +203,7 @@ PHONE_CONVERSATION: # See https://www.rapids.science/latest/features/phone-data-yield/ PHONE_DATA_YIELD: - SENSORS: [] + SENSORS: [PHONE_LOCATIONS] PROVIDERS: RAPIDS: COMPUTE: False @@ -231,30 +231,26 @@ PHONE_LIGHT: # See https://www.rapids.science/latest/features/phone-locations/ PHONE_LOCATIONS: - CONTAINER: locations - LOCATIONS_TO_USE: ALL_RESAMPLED # ALL, GPS, ALL_RESAMPLED, OR FUSED_RESAMPLED + CONTAINER: phone_locations_raw.csv + LOCATIONS_TO_USE: GPS # ALL, GPS, ALL_RESAMPLED, OR FUSED_RESAMPLED FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row - HOME_INFERENCE: - DBSCAN_EPS: 10 # meters - DBSCAN_MINSAMPLES: 5 - THRESHOLD_STATIC : 1 # km/h - CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS PROVIDERS: DORYAB: - COMPUTE: False - FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome"] + COMPUTE: True + FEATURES: ["locationvariance","loglocationvariance","totaldistance","avgspeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","avglengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome"] ACCURACY_LIMIT: 100 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius DBSCAN_EPS: 10 # meters - DBSCAN_MINSAMPLES: 5 + DBSCAN_MINSAMPLES: 3 THRESHOLD_STATIC : 1 # km/h - MAXIMUM_ROW_GAP: 300 - MAXIMUM_ROW_DURATION: 60 + MAXIMUM_ROW_GAP: 300 # seconds MINUTES_DATA_USED: False - CLUSTER_ON: PARTICIPANT_DATASET # PARTICIPANT_DATASET,TIME_SEGMENT - CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS - RADIUS_FOR_HOME: 100 + CLUSTER_ON: PARTICIPANT_DATASET # PARTICIPANT_DATASET, TIME_SEGMENT, TIME_SEGMENT_INSTANCE + INFER_HOME_LOCATION_STRATEGY: DORYAB_STRATEGY # DORYAB_STRATEGY, SUN_LI_VEGA_STRATEGY + MINIMUM_DAYS_TO_DETECT_HOME_CHANGES: 3 + CLUSTERING_ALGORITHM: DBSCAN # DBSCAN, OPTICS + RADIUS_FOR_HOME: 100 SRC_SCRIPT: src/features/phone_locations/doryab/main.py BARNETT: diff --git a/tests/settings/stz_event_config.yaml b/tests/settings/stz_event_config.yaml index 7550821e..4c3bbaa0 100644 --- a/tests/settings/stz_event_config.yaml +++ b/tests/settings/stz_event_config.yaml @@ -203,7 +203,7 @@ PHONE_CONVERSATION: # See https://www.rapids.science/latest/features/phone-data-yield/ PHONE_DATA_YIELD: - SENSORS: [] + SENSORS: [PHONE_LOCATIONS] PROVIDERS: RAPIDS: COMPUTE: False @@ -231,30 +231,26 @@ PHONE_LIGHT: # See https://www.rapids.science/latest/features/phone-locations/ PHONE_LOCATIONS: - CONTAINER: locations - LOCATIONS_TO_USE: ALL_RESAMPLED # ALL, GPS, ALL_RESAMPLED, OR FUSED_RESAMPLED + CONTAINER: phone_locations_raw.csv + LOCATIONS_TO_USE: GPS # ALL, GPS, ALL_RESAMPLED, OR FUSED_RESAMPLED FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row - HOME_INFERENCE: - DBSCAN_EPS: 10 # meters - DBSCAN_MINSAMPLES: 5 - THRESHOLD_STATIC : 1 # km/h - CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS PROVIDERS: DORYAB: - COMPUTE: False - FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome"] + COMPUTE: True + FEATURES: ["locationvariance","loglocationvariance","totaldistance","avgspeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","avglengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome"] ACCURACY_LIMIT: 100 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius DBSCAN_EPS: 10 # meters - DBSCAN_MINSAMPLES: 5 + DBSCAN_MINSAMPLES: 3 THRESHOLD_STATIC : 1 # km/h - MAXIMUM_ROW_GAP: 300 - MAXIMUM_ROW_DURATION: 60 + MAXIMUM_ROW_GAP: 300 # seconds MINUTES_DATA_USED: False - CLUSTER_ON: PARTICIPANT_DATASET # PARTICIPANT_DATASET,TIME_SEGMENT - CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS - RADIUS_FOR_HOME: 100 + CLUSTER_ON: PARTICIPANT_DATASET # PARTICIPANT_DATASET, TIME_SEGMENT, TIME_SEGMENT_INSTANCE + INFER_HOME_LOCATION_STRATEGY: DORYAB_STRATEGY # DORYAB_STRATEGY, SUN_LI_VEGA_STRATEGY + MINIMUM_DAYS_TO_DETECT_HOME_CHANGES: 3 + CLUSTERING_ALGORITHM: DBSCAN # DBSCAN, OPTICS + RADIUS_FOR_HOME: 100 SRC_SCRIPT: src/features/phone_locations/doryab/main.py BARNETT: diff --git a/tests/settings/stz_frequency_config.yaml b/tests/settings/stz_frequency_config.yaml index 56093b2b..7e20a382 100644 --- a/tests/settings/stz_frequency_config.yaml +++ b/tests/settings/stz_frequency_config.yaml @@ -203,7 +203,7 @@ PHONE_CONVERSATION: # See https://www.rapids.science/latest/features/phone-data-yield/ PHONE_DATA_YIELD: - SENSORS: [] + SENSORS: [PHONE_LOCATIONS] PROVIDERS: RAPIDS: COMPUTE: False @@ -231,30 +231,26 @@ PHONE_LIGHT: # See https://www.rapids.science/latest/features/phone-locations/ PHONE_LOCATIONS: - CONTAINER: locations - LOCATIONS_TO_USE: ALL_RESAMPLED # ALL, GPS, ALL_RESAMPLED, OR FUSED_RESAMPLED + CONTAINER: phone_locations_raw.csv + LOCATIONS_TO_USE: GPS # ALL, GPS, ALL_RESAMPLED, OR FUSED_RESAMPLED FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row - HOME_INFERENCE: - DBSCAN_EPS: 10 # meters - DBSCAN_MINSAMPLES: 5 - THRESHOLD_STATIC : 1 # km/h - CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS PROVIDERS: DORYAB: - COMPUTE: False - FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome"] + COMPUTE: True + FEATURES: ["locationvariance","loglocationvariance","totaldistance","avgspeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","avglengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome"] ACCURACY_LIMIT: 100 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius DBSCAN_EPS: 10 # meters - DBSCAN_MINSAMPLES: 5 + DBSCAN_MINSAMPLES: 3 THRESHOLD_STATIC : 1 # km/h - MAXIMUM_ROW_GAP: 300 - MAXIMUM_ROW_DURATION: 60 + MAXIMUM_ROW_GAP: 300 # seconds MINUTES_DATA_USED: False - CLUSTER_ON: PARTICIPANT_DATASET # PARTICIPANT_DATASET,TIME_SEGMENT - CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS - RADIUS_FOR_HOME: 100 + CLUSTER_ON: PARTICIPANT_DATASET # PARTICIPANT_DATASET, TIME_SEGMENT, TIME_SEGMENT_INSTANCE + INFER_HOME_LOCATION_STRATEGY: DORYAB_STRATEGY # DORYAB_STRATEGY, SUN_LI_VEGA_STRATEGY + MINIMUM_DAYS_TO_DETECT_HOME_CHANGES: 3 + CLUSTERING_ALGORITHM: DBSCAN # DBSCAN, OPTICS + RADIUS_FOR_HOME: 100 SRC_SCRIPT: src/features/phone_locations/doryab/main.py BARNETT: diff --git a/tests/settings/stz_periodic_config.yaml b/tests/settings/stz_periodic_config.yaml index d6d47d88..cc3429eb 100644 --- a/tests/settings/stz_periodic_config.yaml +++ b/tests/settings/stz_periodic_config.yaml @@ -203,7 +203,7 @@ PHONE_CONVERSATION: # See https://www.rapids.science/latest/features/phone-data-yield/ PHONE_DATA_YIELD: - SENSORS: [] + SENSORS: [PHONE_LOCATIONS] PROVIDERS: RAPIDS: COMPUTE: False @@ -231,30 +231,26 @@ PHONE_LIGHT: # See https://www.rapids.science/latest/features/phone-locations/ PHONE_LOCATIONS: - CONTAINER: locations - LOCATIONS_TO_USE: ALL_RESAMPLED # ALL, GPS, ALL_RESAMPLED, OR FUSED_RESAMPLED + CONTAINER: phone_locations_raw.csv + LOCATIONS_TO_USE: GPS # ALL, GPS, ALL_RESAMPLED, OR FUSED_RESAMPLED FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row - HOME_INFERENCE: - DBSCAN_EPS: 10 # meters - DBSCAN_MINSAMPLES: 5 - THRESHOLD_STATIC : 1 # km/h - CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS PROVIDERS: DORYAB: - COMPUTE: False - FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome"] + COMPUTE: True + FEATURES: ["locationvariance","loglocationvariance","totaldistance","avgspeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","avglengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome"] ACCURACY_LIMIT: 100 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius DBSCAN_EPS: 10 # meters - DBSCAN_MINSAMPLES: 5 + DBSCAN_MINSAMPLES: 3 THRESHOLD_STATIC : 1 # km/h - MAXIMUM_ROW_GAP: 300 - MAXIMUM_ROW_DURATION: 60 + MAXIMUM_ROW_GAP: 300 # seconds MINUTES_DATA_USED: False - CLUSTER_ON: PARTICIPANT_DATASET # PARTICIPANT_DATASET,TIME_SEGMENT - CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS - RADIUS_FOR_HOME: 100 + CLUSTER_ON: PARTICIPANT_DATASET # PARTICIPANT_DATASET, TIME_SEGMENT, TIME_SEGMENT_INSTANCE + INFER_HOME_LOCATION_STRATEGY: DORYAB_STRATEGY # DORYAB_STRATEGY, SUN_LI_VEGA_STRATEGY + MINIMUM_DAYS_TO_DETECT_HOME_CHANGES: 3 + CLUSTERING_ALGORITHM: DBSCAN # DBSCAN, OPTICS + RADIUS_FOR_HOME: 100 SRC_SCRIPT: src/features/phone_locations/doryab/main.py BARNETT: diff --git a/tools/config.schema.yaml b/tools/config.schema.yaml index fac0301d..01e7ec02 100644 --- a/tools/config.schema.yaml +++ b/tools/config.schema.yaml @@ -612,22 +612,6 @@ properties: FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION: type: integer exclusiveMinimum: 0 - HOME_INFERENCE: - type: object - required: [DBSCAN_EPS, DBSCAN_MINSAMPLES, THRESHOLD_STATIC, CLUSTERING_ALGORITHM] - properties: - DBSCAN_EPS: - type: integer - exclusiveMinimum: 0 - DBSCAN_MINSAMPLES: - type: integer - exclusiveMinimum: 0 - THRESHOLD_STATIC: - type: integer - exclusiveMinimum: 0 - CLUSTERING_ALGORITHM: - type: string - enum: ["DBSCAN", "OPTICS"] PROVIDERS: type: ["null", object] properties: @@ -640,7 +624,7 @@ properties: uniqueItems: True items: type: string - enum: [locationvariance,loglocationvariance,totaldistance,averagespeed,varspeed,circadianmovement,numberofsignificantplaces,numberlocationtransitions,radiusgyration,timeattop1location,timeattop2location,timeattop3location,movingtostaticratio,outlierstimepercent,maxlengthstayatclusters,minlengthstayatclusters,meanlengthstayatclusters,stdlengthstayatclusters,locationentropy,normalizedlocationentropy,timeathome] + enum: [locationvariance,loglocationvariance,totaldistance,avgspeed,varspeed,numberofsignificantplaces,numberlocationtransitions,radiusgyration,timeattop1location,timeattop2location,timeattop3location,movingtostaticratio,outlierstimepercent,maxlengthstayatclusters,minlengthstayatclusters,avglengthstayatclusters,stdlengthstayatclusters,locationentropy,normalizedlocationentropy,timeathome,homelabel] ACCURACY_LIMIT: type: integer exclusiveMinimum: 0 @@ -656,17 +640,23 @@ properties: MAXIMUM_ROW_GAP: type: integer exclusiveMinimum: 0 - MAXIMUM_ROW_DURATION: - type: integer - exclusiveMinimum: 0 MINUTES_DATA_USED: type: boolean CLUSTER_ON: type: string - enum: ["PARTICIPANT_DATASET", "TIME_SEGMENT"] + enum: ["PARTICIPANT_DATASET", "TIME_SEGMENT", "TIME_SEGMENT_INSTANCE"] + INFER_HOME_LOCATION_STRATEGY: + type: string + enum: ["DORYAB_STRATEGY", "SUN_LI_VEGA_STRATEGY"] + MINIMUM_DAYS_TO_DETECT_HOME_CHANGES: + type: integer + minimum: 0 CLUSTERING_ALGORITHM: type: string enum: ["DBSCAN", "OPTICS"] + RADIUS_FOR_HOME: + type: integer + exclusiveMinimum: 0 BARNETT: allOf: