From 266dd28d02d844351c8bd003a2c33cb9b69b26b6 Mon Sep 17 00:00:00 2001 From: JulioV Date: Fri, 11 Dec 2020 12:03:22 -0500 Subject: [PATCH 1/5] Add bluetooth doryab features --- config.yaml | 8 ++ docs/citation.md | 7 ++ docs/features/phone-bluetooth.md | 74 ++++++++++++++ src/features/phone_bluetooth/doryab/main.py | 106 ++++++++++++++++++++ 4 files changed, 195 insertions(+) create mode 100644 src/features/phone_bluetooth/doryab/main.py diff --git a/config.yaml b/config.yaml index 014881b4..4c92e850 100644 --- a/config.yaml +++ b/config.yaml @@ -127,6 +127,14 @@ PHONE_BLUETOOTH: FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"] SRC_FOLDER: "rapids" # inside src/features/phone_bluetooth SRC_LANGUAGE: "r" + DORYAB: + COMPUTE: False + FEATURES: + ALL: ["countscans", "uniquedevices", "countscansmostuniquedevice", "countscansleastuniquedevice", "meanscans", "stdscans"] + OWN: ["countscans", "uniquedevices", "countscansmostuniquedevice", "countscansleastuniquedevice", "meanscans", "stdscans"] + OTHERS: ["countscans", "uniquedevices", "countscansmostuniquedevice", "countscansleastuniquedevice", "meanscans", "stdscans"] + SRC_FOLDER: "doryab" # inside src/features/phone_bluetooth + SRC_LANGUAGE: "python" # See https://www.rapids.science/latest/features/phone-calls/ PHONE_CALLS: diff --git a/docs/citation.md b/docs/citation.md index 30d5988f..94cae21d 100644 --- a/docs/citation.md +++ b/docs/citation.md @@ -28,6 +28,13 @@ If you computed applications foreground features using the app category (genre) !!! cite "Stachl et al. citation" Clemens Stachl, Quay Au, Ramona Schoedel, Samuel D. Gosling, Gabriella M. Harari, Daniel Buschek, Sarah Theres Völkel, Tobias Schuwerk, Michelle Oldemeier, Theresa Ullmann, Heinrich Hussmann, Bernd Bischl, Markus Bühner. 
Proceedings of the National Academy of Sciences Jul 2020, 117 (30) 17680-17687; DOI: 10.1073/pnas.1920484117 +## Doryab (bluetooth) + +If you computed bluetooth features using the provider `[PHONE_BLUETOOTH][DORYAB]` cite [this paper](https://arxiv.org/abs/1812.10394) in addition to RAPIDS. + +!!! cite "Doryab et al. citation" + Doryab, A., Chikarsel, P., Liu, X., & Dey, A. K. (2019). Extraction of Behavioral Features from Smartphone and Wearable Data. ArXiv:1812.10394 [Cs, Stat]. http://arxiv.org/abs/1812.10394 + ## Barnett (locations) If you computed locations features using the provider `[PHONE_LOCATIONS][BARNETT]` cite [this paper](https://doi.org/10.1093/biostatistics/kxy059) and [this paper](https://doi.org/10.1145/2750858.2805845) in addition to RAPIDS. diff --git a/docs/features/phone-bluetooth.md b/docs/features/phone-bluetooth.md index e37b2628..57219572 100644 --- a/docs/features/phone-bluetooth.md +++ b/docs/features/phone-bluetooth.md @@ -39,3 +39,77 @@ Features description for `[PHONE_BLUETOOTH][PROVIDERS][RAPIDS]`: !!! note "Assumptions/Observations" NA + +## DORYAB provider + +!!! info "Available time segments and platforms" + - Available for all time segments + - Available for Android only + +!!! info "File Sequence" + ```bash + - data/raw/{pid}/phone_bluetooth_raw.csv + - data/raw/{pid}/phone_bluetooth_with_datetime.csv + - data/interim/{pid}/phone_bluetooth_features/phone_bluetooth_{language}_{provider_key}.csv + - data/processed/features/{pid}/phone_bluetooth.csv" + ``` + + +Parameters description for `[PHONE_BLUETOOTH][PROVIDERS][DORYAB]`: + +|Key                              | Description | +|----------------|----------------------------------------------------------------------------------------------------------------------------------- +|`[COMPUTE]`| Set to `True` to extract `PHONE_BLUETOOTH` features from the `DORYAB` provider| +|`[FEATURES]` | Features to be computed, see table below. 
These features are computed for three device categories: `all` devices, `own` devices and `other` devices. + + +Features description for `[PHONE_BLUETOOTH][PROVIDERS][DORYAB]`: + +|Feature |Units |Description| +|-------------------------- |---------- |---------------------------| +| countscans | scans | Number of scans (rows) from the devices sensed during a time segment instance. The more scans a bluetooth device has the longer it remained within range of the participant's phone | +| uniquedevices | devices | Number of unique bluetooth devices sensed during a time segment instance as identified by their hardware addresses (`bt_address`) | +| countscansmostuniquedevice | scans | Number of scans of the most sensed device within each time segment instance| +| countscansleastuniquedevice| scans| Number of scans of the least sensed device within each time segment instance | +| meanscans | scans| Mean of the scans of every sensed device within each time segment instance| +| stdscans | scans| Standard deviation of the scans of every sensed device within each time segment instance| + +!!! note "Assumptions/Observations" + - This provider is adapted from the work by [Doryab et al](../../citation#doryab-bluetooth). Devices are clasified as belonging to the participant (`own`) or to other people (`others`) using k-means based on the number of times and the number of days each device was detected across each participant's dataset. + - If ownership cannot be computed because all devices were detected on only one day, they are all considered as `other`. Thus `all` and `other` features will be equal. + - These features are computed for devices detected within each time segment instance. 
For example, let's say that we logged the following devices on three different time segment instances (days) for `p01`: + ```csv + local_date bt_address + 2016-11-29 55C836F5-487E-405F-8E28-21DBD40FA4FF + 2016-11-29 55C836F5-487E-405F-8E28-21DBD40FA4FF + 2016-11-29 55C836F5-487E-405F-8E28-21DBD40FA4FF + 2016-11-29 48872A52-68DE-420D-98DA-73339A1C4685 + 2016-11-29 48872A52-68DE-420D-98DA-73339A1C4685 + 2016-11-30 55C836F5-487E-405F-8E28-21DBD40FA4FF + 2016-11-30 55C836F5-487E-405F-8E28-21DBD40FA4FF + 2016-11-30 48872A52-68DE-420D-98DA-73339A1C4685 + 2017-05-07 5C5A9C41-2F68-4CEB-96D0-77DE3729B729 + 2017-05-07 25262DC7-780C-4AD5-AD3A-D9776AEF7FC1 + 2017-05-07 5B1E6981-2E50-4D9A-99D8-67AED430C5A8 + 2017-05-07 6C444841-FE64-4375-BC3F-FA410CDC0AC7 + 2017-05-07 5B1E6981-2E50-4D9A-99D8-67AED430C5A8 + 2017-05-07 4DC7A22D-9F1F-4DEF-8576-086910AABCB5 + ``` + - For each device we compute `days_scanned` (the number of days on which each device was detected), `scans` (the number of times each device was detected), `scans_per_day` that's equal to `scans/days_scanned`, and whether a devices is labelled as `own` or `other` (note the last device is labelled as a `own` device because it was detected 6 times over two time segment instances): + ```csv + bt_address days_scanned scans scans_per_day own_device + 25262DC7-780C-4AD5-AD3A-D9776AEF7FC1 1 1 1.0 0 + 4DC7A22D-9F1F-4DEF-8576-086910AABCB5 1 1 1.0 0 + 5C5A9C41-2F68-4CEB-96D0-77DE3729B729 1 1 1.0 0 + 6C444841-FE64-4375-BC3F-FA410CDC0AC7 1 1 1.0 0 + 5B1E6981-2E50-4D9A-99D8-67AED430C5A8 1 2 2.0 0 + 48872A52-68DE-420D-98DA-73339A1C4685 2 3 1.5 0 + 55C836F5-487E-405F-8E28-21DBD40FA4FF 2 5 2.5 1 + ``` + - These are the metrics for each time instance (day) for `own` and `other` devices (we ignore `all` for brevity). The only `own` device (`55C836F5-487E-405F-8E28-21DBD40FA4FF`) was detected on the first two days, 3 and 2 times respectively, the `other` devices where detected on all three days. 
On the last day (`2017-05-07`) there were 6 scans from 5 unique devices, the most frequent device for that day was `5B1E6981-2E50-4D9A-99D8-67AED430C5A8` with 2 scans, and the mean number of scans among all devices was 1.2 (`[1 + 1 + 1 + 1 + 2] / 5`) + ```csv + local_segment countscansown uniquedevicesown countscansmostuniquedeviceown countscansleastuniquedeviceown meanscansown stdscansown countscansothers uniquedevicesothers countscansmostuniquedeviceothers countscansleastuniquedeviceothers meanscansothers stdscansothers + 2016-11-29 3.0 1.0 3.0 3.0 3.0 NaN 2 1 2 2 2.0 NaN + 2016-11-30 2.0 1.0 2.0 2.0 2.0 NaN 1 1 1 1 1.0 NaN + 2017-05-07 NaN NaN NaN NaN NaN NaN 6 5 2 1 1.2 0.447214 + ``` diff --git a/src/features/phone_bluetooth/doryab/main.py b/src/features/phone_bluetooth/doryab/main.py new file mode 100644 index 00000000..32b8747f --- /dev/null +++ b/src/features/phone_bluetooth/doryab/main.py @@ -0,0 +1,106 @@ +import pandas as pd +import numpy as np +from sklearn.cluster import KMeans + +def deviceFeatures(devices, ownership, features_to_compute, features): + if devices.shape[0] == 0: + device_value_counts = pd.DataFrame(columns=["local_segment", "bt_address", "scans"], dtype=int) + else: + device_value_counts = devices.groupby(["local_segment"])["bt_address"].value_counts().to_frame("scans").reset_index() + + if "countscans" in features_to_compute: + features = features.join(device_value_counts.groupby("local_segment")["scans"].sum().to_frame("countscans" + ownership), how="outer") + if "uniquedevices" in features_to_compute: + features = features.join(device_value_counts.groupby("local_segment")["bt_address"].nunique().to_frame("uniquedevices" + ownership), how="outer") + if "countscansmostuniquedevice" in features_to_compute: + features = features.join(device_value_counts.groupby("local_segment")["scans"].max().to_frame("countscansmostuniquedevice" + ownership), how="outer") + if "countscansleastuniquedevice" in features_to_compute: + features = 
features.join(device_value_counts.groupby("local_segment")["scans"].min().to_frame("countscansleastuniquedevice" + ownership), how="outer") + if "meanscans" in features_to_compute: + features = features.join(device_value_counts.groupby("local_segment")["scans"].mean().to_frame("meanscans" + ownership), how="outer") + if "stdscans" in features_to_compute: + features = features.join(device_value_counts.groupby("local_segment")["scans"].std().to_frame("stdscans" + ownership), how="outer") + return(features) + +def deviceFrequency(bt_data): + bt_data = bt_data[["local_date", "bt_address"]].dropna(subset=["bt_address"]) + bt_data = bt_data.groupby("bt_address").agg({"local_date": pd.Series.nunique, "bt_address" : 'count'}) + bt_data = bt_data.rename(columns={"local_date" : "days_scanned", "bt_address" : "scans"}) + bt_data["scans_per_day"] = bt_data["scans"] / bt_data["days_scanned"] + return bt_data + +def ownership_based_on_clustering(bt_frequency): + bt_frequency = bt_frequency.reset_index() + for col in ["scans_per_day", "days_scanned", "scans"]: + col_zscore = col + '_z' + bt_frequency[col_zscore] = (bt_frequency[col] - bt_frequency[col].mean()) / bt_frequency[col].std(ddof=0) + + bt_frequency = bt_frequency.dropna(how='any') + if len(bt_frequency) == 0: + bt_frequency["own_device"] = None + return bt_frequency[["bt_address", "own_device"]] + + avgfreq_z = bt_frequency["scans_per_day_z"] + numdays_z = bt_frequency["days_scanned_z"] + score = avgfreq_z + numdays_z + maxscore = np.max(score) + minscore = np.min(score) + midscore = (maxscore + minscore) / 2 + initial_k2 = np.array([[maxscore], [minscore]], np.int32) + initial_k3 = np.array([[maxscore], [midscore], [minscore]], np.int32) + X_array = score.values + X = np.reshape(X_array, (len(score), 1)) + + # K = 2, devices I own VS devices other people own + kmeans_k2 = KMeans(n_clusters=2, init = initial_k2, n_init = 1).fit(X) + labels_k2 = kmeans_k2.labels_ + centers_k2 = [c[0] for c in kmeans_k2.cluster_centers_] 
+ diff_k2 = [(X_array[xi] - centers_k2[labels_k2[xi]])**2 for xi in range(0, len(X_array))] + sum_dist_k2 = sum(diff_k2) + + # K = 3, devices I own VS devices my partner/roommate owns (can also be other devices I own though) VS devices other people own + kmeans_k3 = KMeans(n_clusters=3, init=initial_k3, n_init = 1).fit(X) + labels_k3 = kmeans_k3.labels_ + centers_k3 = [c[0] for c in kmeans_k3.cluster_centers_] + diff_k3 = [(X_array[xi] - centers_k3[labels_k3[xi]])**2 for xi in range(0, len(X_array))] + sum_dist_k3 = sum(diff_k3) + + if sum_dist_k2 < sum_dist_k3: # K = 2 is better + labels = labels_k2 + centers = centers_k2 + numclust = 2 + else: + labels = labels_k3 + centers = centers_k3 + numclust = 3 + + maxcluster = np.where(labels == np.argmax(centers), 1, 0) + bt_frequency["own_device"] = maxcluster + return bt_frequency[["bt_address", "own_device"]] + + +def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs): + + bt_data = pd.read_csv(sensor_data_files["sensor_data"]) + base_features = set(["countscans", "uniquedevices", "countscansmostuniquedevice", "countscansleastuniquedevice", "meanscans", "stdscans"]) + ownership_keys = [x.lower() for x in provider["FEATURES"].keys()] + if set(ownership_keys) != set(["own", "others", "all"]): + raise ValueError("[PHONE_BLUETOOTH][DORYAB][FEATURES] config key can only have three lists called ALL, OWN and OTHERS, instead you provided {}".format(ownership_keys)) + + device_ownership = ownership_based_on_clustering(deviceFrequency(bt_data)).set_index("bt_address") + bt_data = bt_data.set_index("bt_address").join(device_ownership, how="left").reset_index() + bt_data["own_device"].fillna(0, inplace=True) + segment_bt_data = filter_data_by_segment(bt_data, time_segment) + features = pd.DataFrame(columns=['local_segment']).set_index("local_segment") + for ownership in provider["FEATURES"].keys(): + features_to_compute = list(set(provider["FEATURES"][ownership]) & base_features) + 
if ownership == "OWN": + owner_segment_bt_data = segment_bt_data.query("own_device == 1") + elif ownership == "OTHERS": + owner_segment_bt_data = segment_bt_data.query("own_device == 0") + else: #ALL + owner_segment_bt_data = segment_bt_data + features = deviceFeatures(owner_segment_bt_data, ownership.lower(), features_to_compute, features) + + features = features.reset_index() + return features From 5bd1bfe85638f20c84961f60a7e5f0a96b05e1a5 Mon Sep 17 00:00:00 2001 From: JulioV Date: Sat, 12 Dec 2020 17:01:46 -0500 Subject: [PATCH 2/5] Add new bluetooth doryab features and deprecate rapids provider --- config.yaml | 17 ++- docs/features/phone-bluetooth.md | 135 +++++++++++++------- src/features/phone_bluetooth/doryab/main.py | 74 +++++++++-- 3 files changed, 163 insertions(+), 63 deletions(-) diff --git a/config.yaml b/config.yaml index 4c92e850..b216b5d1 100644 --- a/config.yaml +++ b/config.yaml @@ -128,11 +128,20 @@ PHONE_BLUETOOTH: SRC_FOLDER: "rapids" # inside src/features/phone_bluetooth SRC_LANGUAGE: "r" DORYAB: - COMPUTE: False + COMPUTE: FALSE FEATURES: - ALL: ["countscans", "uniquedevices", "countscansmostuniquedevice", "countscansleastuniquedevice", "meanscans", "stdscans"] - OWN: ["countscans", "uniquedevices", "countscansmostuniquedevice", "countscansleastuniquedevice", "meanscans", "stdscans"] - OTHERS: ["countscans", "uniquedevices", "countscansmostuniquedevice", "countscansleastuniquedevice", "meanscans", "stdscans"] + ALL: + DEVICES: ["countscans", "uniquedevices", "meanscans", "stdscans"] + SCANS_MOST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"] + SCANS_LEAST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"] + OWN: + DEVICES: ["countscans", "uniquedevices", "meanscans", "stdscans"] + SCANS_MOST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"] + SCANS_LEAST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"] + OTHERS: + DEVICES: ["countscans", "uniquedevices", 
"meanscans", "stdscans"] + SCANS_MOST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"] + SCANS_LEAST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"] SRC_FOLDER: "doryab" # inside src/features/phone_bluetooth SRC_LANGUAGE: "python" diff --git a/docs/features/phone-bluetooth.md b/docs/features/phone-bluetooth.md index 57219572..765676e3 100644 --- a/docs/features/phone-bluetooth.md +++ b/docs/features/phone-bluetooth.md @@ -8,6 +8,9 @@ Sensor parameters description for `[PHONE_BLUETOOTH]`: ## RAPIDS provider +!!! warning + The features of this provider are deprecated in favor of `DORYAB` provider (see below). + !!! info "Available time segments and platforms" - Available for all time segments - Available for Android only @@ -33,14 +36,15 @@ Features description for `[PHONE_BLUETOOTH][PROVIDERS][RAPIDS]`: |Feature |Units |Description| |-------------------------- |---------- |---------------------------| -| countscans | devices | Number of scanned devices during a `time_segment`, a device can be detected multiple times over time and these appearances are counted separately | -| uniquedevices | devices | Number of unique devices during a `time_segment` as identified by their hardware (`bt_address`) address | -| countscansmostuniquedevice | scans | Number of scans of the most scanned device during a `time_segment` across the whole monitoring period | +| {--countscans--} | devices | Number of scanned devices during a time segment, a device can be detected multiple times over time and these appearances are counted separately | +| {--uniquedevices--} | devices | Number of unique devices during a time segment as identified by their hardware (`bt_address`) address | +| {--countscansmostuniquedevice--} | scans | Number of scans of the most sensed device within each time segment instance | !!! 
note "Assumptions/Observations" - NA + - From `v0.2.0` `countscans`, `uniquedevices`, `countscansmostuniquedevice` were deprecated because they overlap with the respective features for `ALL` devices of the `PHONE_BLUETOOTH` `DORYAB` provider ## DORYAB provider +This provider is adapted from the work by [Doryab et al](../../citation#doryab-bluetooth). !!! info "Available time segments and platforms" - Available for all time segments @@ -65,51 +69,92 @@ Parameters description for `[PHONE_BLUETOOTH][PROVIDERS][DORYAB]`: Features description for `[PHONE_BLUETOOTH][PROVIDERS][DORYAB]`: -|Feature |Units |Description| +|Feature                                                                                   |Units |Description| |-------------------------- |---------- |---------------------------| | countscans | scans | Number of scans (rows) from the devices sensed during a time segment instance. The more scans a bluetooth device has the longer it remained within range of the participant's phone | | uniquedevices | devices | Number of unique bluetooth devices sensed during a time segment instance as identified by their hardware addresses (`bt_address`) | -| countscansmostuniquedevice | scans | Number of scans of the most sensed device within each time segment instance| -| countscansleastuniquedevice| scans| Number of scans of the least sensed device within each time segment instance | | meanscans | scans| Mean of the scans of every sensed device within each time segment instance| | stdscans | scans| Standard deviation of the scans of every sensed device within each time segment instance| +| countscans{==most==}frequentdevice{==within==}segments | scans | Number of scans of the **most** sensed device **within** each time segment instance| +| countscans{==least==}frequentdevice{==within==}segments| scans| Number of scans of the **least** sensed device **within** each time segment instance | +| countscans{==most==}frequentdevice{==across==}segments | scans | Number of scans 
of the **most** sensed device **across** time segment instances of the same type| +| countscans{==least==}frequentdevice{==across==}segments| scans| Number of scans of the **least** sensed device **across** time segment instances of the same type per device| +| countscans{==most==}frequentdevice{==acrossdataset==} | scans | Number of scans of the **most** sensed device **across** the entire dataset of every participant| +| countscans{==least==}frequentdevice{==acrossdataset==}| scans| Number of scans of the **least** sensed device **across** the entire dataset of every participant | + !!! note "Assumptions/Observations" - - This provider is adapted from the work by [Doryab et al](../../citation#doryab-bluetooth). Devices are clasified as belonging to the participant (`own`) or to other people (`others`) using k-means based on the number of times and the number of days each device was detected across each participant's dataset. - - If ownership cannot be computed because all devices were detected on only one day, they are all considered as `other`. Thus `all` and `other` features will be equal. - - These features are computed for devices detected within each time segment instance. 
For example, let's say that we logged the following devices on three different time segment instances (days) for `p01`: - ```csv - local_date bt_address - 2016-11-29 55C836F5-487E-405F-8E28-21DBD40FA4FF - 2016-11-29 55C836F5-487E-405F-8E28-21DBD40FA4FF - 2016-11-29 55C836F5-487E-405F-8E28-21DBD40FA4FF - 2016-11-29 48872A52-68DE-420D-98DA-73339A1C4685 - 2016-11-29 48872A52-68DE-420D-98DA-73339A1C4685 - 2016-11-30 55C836F5-487E-405F-8E28-21DBD40FA4FF - 2016-11-30 55C836F5-487E-405F-8E28-21DBD40FA4FF - 2016-11-30 48872A52-68DE-420D-98DA-73339A1C4685 - 2017-05-07 5C5A9C41-2F68-4CEB-96D0-77DE3729B729 - 2017-05-07 25262DC7-780C-4AD5-AD3A-D9776AEF7FC1 - 2017-05-07 5B1E6981-2E50-4D9A-99D8-67AED430C5A8 - 2017-05-07 6C444841-FE64-4375-BC3F-FA410CDC0AC7 - 2017-05-07 5B1E6981-2E50-4D9A-99D8-67AED430C5A8 - 2017-05-07 4DC7A22D-9F1F-4DEF-8576-086910AABCB5 - ``` - - For each device we compute `days_scanned` (the number of days on which each device was detected), `scans` (the number of times each device was detected), `scans_per_day` that's equal to `scans/days_scanned`, and whether a devices is labelled as `own` or `other` (note the last device is labelled as a `own` device because it was detected 6 times over two time segment instances): - ```csv - bt_address days_scanned scans scans_per_day own_device - 25262DC7-780C-4AD5-AD3A-D9776AEF7FC1 1 1 1.0 0 - 4DC7A22D-9F1F-4DEF-8576-086910AABCB5 1 1 1.0 0 - 5C5A9C41-2F68-4CEB-96D0-77DE3729B729 1 1 1.0 0 - 6C444841-FE64-4375-BC3F-FA410CDC0AC7 1 1 1.0 0 - 5B1E6981-2E50-4D9A-99D8-67AED430C5A8 1 2 2.0 0 - 48872A52-68DE-420D-98DA-73339A1C4685 2 3 1.5 0 - 55C836F5-487E-405F-8E28-21DBD40FA4FF 2 5 2.5 1 - ``` - - These are the metrics for each time instance (day) for `own` and `other` devices (we ignore `all` for brevity). The only `own` device (`55C836F5-487E-405F-8E28-21DBD40FA4FF`) was detected on the first two days, 3 and 2 times respectively, the `other` devices where detected on all three days. 
On the last day (`2017-05-07`) there were 6 scans from 5 unique devices, the most frequent device for that day was `5B1E6981-2E50-4D9A-99D8-67AED430C5A8` with 2 scans, and the mean number of scans among all devices was 1.2 (`[1 + 1 + 1 + 1 + 2] / 5`) - ```csv - local_segment countscansown uniquedevicesown countscansmostuniquedeviceown countscansleastuniquedeviceown meanscansown stdscansown countscansothers uniquedevicesothers countscansmostuniquedeviceothers countscansleastuniquedeviceothers meanscansothers stdscansothers - 2016-11-29 3.0 1.0 3.0 3.0 3.0 NaN 2 1 2 2 2.0 NaN - 2016-11-30 2.0 1.0 2.0 2.0 2.0 NaN 1 1 1 1 1.0 NaN - 2017-05-07 NaN NaN NaN NaN NaN NaN 6 5 2 1 1.2 0.447214 - ``` + - Devices are classified as belonging to the participant (`own`) or to other people (`others`) using k-means based on the number of times and the number of days each device was detected across each participant's dataset. See [Doryab et al](../../citation#doryab-bluetooth) for more details. + - If ownership cannot be computed because all devices were detected on only one day, they are all considered as `other`. Thus `all` and `other` features will be equal. The likelihood of this scenario decreases the more days of data you have. + - The most and least frequent devices will be the same across time segment instances and across the entire dataset when every time segment instance covers every hour of a dataset. For example, daily segments (00:00 to 23:59) fall in this category but morning segments (06:00am to 11:59am) or periodic 30-minute segments don't. + + ??? info "Example" + + ??? example "Simplified raw bluetooth data" + The following is a simplified example with bluetooth data from three days and two time segments: morning and afternoon. There are two `own` devices: `55C836F5-487E-405F-8E28-21DBD40FA4FF` detected seven times across two days and `499A1EAF-DDF1-4657-986C-EA5032104448` detected eight times on a single day.
+ ```csv + local_date segment bt_address own_device + 2016-11-29 morning 55C836F5-487E-405F-8E28-21DBD40FA4FF 1 + 2016-11-29 morning 55C836F5-487E-405F-8E28-21DBD40FA4FF 1 + 2016-11-29 morning 55C836F5-487E-405F-8E28-21DBD40FA4FF 1 + 2016-11-29 morning 55C836F5-487E-405F-8E28-21DBD40FA4FF 1 + 2016-11-29 morning 48872A52-68DE-420D-98DA-73339A1C4685 0 + 2016-11-29 afternoon 55C836F5-487E-405F-8E28-21DBD40FA4FF 1 + 2016-11-29 afternoon 48872A52-68DE-420D-98DA-73339A1C4685 0 + 2016-11-30 morning 55C836F5-487E-405F-8E28-21DBD40FA4FF 1 + 2016-11-30 morning 48872A52-68DE-420D-98DA-73339A1C4685 0 + 2016-11-30 morning 25262DC7-780C-4AD5-AD3A-D9776AEF7FC1 0 + 2016-11-30 morning 5B1E6981-2E50-4D9A-99D8-67AED430C5A8 0 + 2016-11-30 morning 5B1E6981-2E50-4D9A-99D8-67AED430C5A8 0 + 2016-11-30 afternoon 55C836F5-487E-405F-8E28-21DBD40FA4FF 1 + 2017-05-07 morning 5C5A9C41-2F68-4CEB-96D0-77DE3729B729 0 + 2017-05-07 morning 25262DC7-780C-4AD5-AD3A-D9776AEF7FC1 0 + 2017-05-07 morning 5B1E6981-2E50-4D9A-99D8-67AED430C5A8 0 + 2017-05-07 morning 6C444841-FE64-4375-BC3F-FA410CDC0AC7 0 + 2017-05-07 morning 4DC7A22D-9F1F-4DEF-8576-086910AABCB5 0 + 2017-05-07 afternoon 5B1E6981-2E50-4D9A-99D8-67AED430C5A8 0 + 2017-05-07 afternoon 499A1EAF-DDF1-4657-986C-EA5032104448 1 + 2017-05-07 afternoon 499A1EAF-DDF1-4657-986C-EA5032104448 1 + 2017-05-07 afternoon 499A1EAF-DDF1-4657-986C-EA5032104448 1 + 2017-05-07 afternoon 499A1EAF-DDF1-4657-986C-EA5032104448 1 + 2017-05-07 afternoon 499A1EAF-DDF1-4657-986C-EA5032104448 1 + 2017-05-07 afternoon 499A1EAF-DDF1-4657-986C-EA5032104448 1 + 2017-05-07 afternoon 499A1EAF-DDF1-4657-986C-EA5032104448 1 + 2017-05-07 afternoon 499A1EAF-DDF1-4657-986C-EA5032104448 1 + ``` + + + + + ??? 
example "The most and least frequent `OTHER` devices (`own_device == 0`) during morning segments" + The most and least frequent `ALL`|`OWN`|`OTHER` devices are computed within each time segment instance, across time segment instances of the same type and across the entire dataset of each person. These are the most and least frequent devices for `OTHER` devices during morning segments. + ```csv + most frequent device across 2016-11-29 morning: '48872A52-68DE-420D-98DA-73339A1C4685' (this device is the only one in this instance) + least frequent device across 2016-11-29 morning: '48872A52-68DE-420D-98DA-73339A1C4685' (this device is the only one in this instance) + most frequent device across 2016-11-30 morning: '5B1E6981-2E50-4D9A-99D8-67AED430C5A8' + least frequent device across 2016-11-30 morning: '25262DC7-780C-4AD5-AD3A-D9776AEF7FC1' (when tied, the first occurrence is chosen) + most frequent device across 2017-05-07 morning: '25262DC7-780C-4AD5-AD3A-D9776AEF7FC1' (when tied, the first occurrence is chosen) + least frequent device across 2017-05-07 morning: '25262DC7-780C-4AD5-AD3A-D9776AEF7FC1' (when tied, the first occurrence is chosen) + + most frequent across morning segments: '5B1E6981-2E50-4D9A-99D8-67AED430C5A8' + least frequent across morning segments: '6C444841-FE64-4375-BC3F-FA410CDC0AC7' (when tied, the first occurrence is chosen) + + most frequent across dataset: '499A1EAF-DDF1-4657-986C-EA5032104448' (only taking into account "morning" segments) + least frequent across dataset: '4DC7A22D-9F1F-4DEF-8576-086910AABCB5' (when tied, the first occurrence is chosen) + ``` + + ???
example "Bluetooth features for `OTHER` devices and morning segments" + For brevity we only show the following features for morning segments: + ```yaml + OTHER: + DEVICES: ["countscans", "uniquedevices", "meanscans", "stdscans"] + SCANS_MOST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"] + ``` + + Note that `countscansmostfrequentdeviceacrossdatasetothers` is all `0`s because `499A1EAF-DDF1-4657-986C-EA5032104448` is excluded from the count as is labelled as an `own` device (not `other`). + ```csv + local_segment countscansothers uniquedevicesothers meanscansothers stdscansothers countscansmostfrequentdevicewithinsegmentsothers countscansmostfrequentdeviceacrosssegmentsothers countscansmostfrequentdeviceacrossdatasetothers + 2016-11-29-morning 1 1 1.000000 NaN 1 0.0 0.0 + 2016-11-30-morning 4 3 1.333333 0.57735 2 2.0 2.0 + 2017-05-07-morning 5 5 1.000000 0.00000 1 1.0 1.0 + ``` diff --git a/src/features/phone_bluetooth/doryab/main.py b/src/features/phone_bluetooth/doryab/main.py index 32b8747f..76b6eb94 100644 --- a/src/features/phone_bluetooth/doryab/main.py +++ b/src/features/phone_bluetooth/doryab/main.py @@ -2,7 +2,7 @@ import pandas as pd import numpy as np from sklearn.cluster import KMeans -def deviceFeatures(devices, ownership, features_to_compute, features): +def deviceFeatures(devices, ownership, common_devices, features_to_compute, features): if devices.shape[0] == 0: device_value_counts = pd.DataFrame(columns=["local_segment", "bt_address", "scans"], dtype=int) else: @@ -12,14 +12,29 @@ def deviceFeatures(devices, ownership, features_to_compute, features): features = features.join(device_value_counts.groupby("local_segment")["scans"].sum().to_frame("countscans" + ownership), how="outer") if "uniquedevices" in features_to_compute: features = features.join(device_value_counts.groupby("local_segment")["bt_address"].nunique().to_frame("uniquedevices" + ownership), how="outer") - if "countscansmostuniquedevice" in features_to_compute: 
- features = features.join(device_value_counts.groupby("local_segment")["scans"].max().to_frame("countscansmostuniquedevice" + ownership), how="outer") - if "countscansleastuniquedevice" in features_to_compute: - features = features.join(device_value_counts.groupby("local_segment")["scans"].min().to_frame("countscansleastuniquedevice" + ownership), how="outer") if "meanscans" in features_to_compute: features = features.join(device_value_counts.groupby("local_segment")["scans"].mean().to_frame("meanscans" + ownership), how="outer") if "stdscans" in features_to_compute: features = features.join(device_value_counts.groupby("local_segment")["scans"].std().to_frame("stdscans" + ownership), how="outer") + # Most frequent device within segments, across segments, and across dataset + if "countscansmostfrequentdevicewithinsegments" in features_to_compute: + features = features.join(device_value_counts.groupby("local_segment")["scans"].max().to_frame("countscansmostfrequentdevicewithinsegments" + ownership), how="outer") + if "countscansmostfrequentdeviceacrosssegments" in features_to_compute: + common_device = common_devices['most_segments'] + features = features.join(device_value_counts.query("bt_address in @common_device").groupby("local_segment")["scans"].max().to_frame("countscansmostfrequentdeviceacrosssegments" + ownership), how="outer") + if "countscansmostfrequentdeviceacrossdataset" in features_to_compute: + common_device = common_devices['most_dataset'] + features = features.join(device_value_counts.query("bt_address in @common_device").groupby("local_segment")["scans"].max().to_frame("countscansmostfrequentdeviceacrossdataset" + ownership), how="outer") + # Least frequent device within segments, across segments, and across dataset + if "countscansleastfrequentdevicewithinsegments" in features_to_compute: + features = features.join(device_value_counts.groupby("local_segment")["scans"].min().to_frame("countscansleastfrequentdevicewithinsegments" + ownership), 
how="outer") + if "countscansleastfrequentdeviceacrosssegments" in features_to_compute: + common_device = common_devices['least_segments'] + features = features.join(device_value_counts.query("bt_address in @common_device").groupby("local_segment")["scans"].min().to_frame("countscansleastfrequentdeviceacrosssegments" + ownership), how="outer") + if "countscansleastfrequentdeviceacrossdataset" in features_to_compute: + common_device = common_devices['least_dataset'] + features = features.join(device_value_counts.query("bt_address in @common_device").groupby("local_segment")["scans"].min().to_frame("countscansleastfrequentdeviceacrossdataset" + ownership), how="outer") + return(features) def deviceFrequency(bt_data): @@ -77,30 +92,61 @@ def ownership_based_on_clustering(bt_frequency): maxcluster = np.where(labels == np.argmax(centers), 1, 0) bt_frequency["own_device"] = maxcluster return bt_frequency[["bt_address", "own_device"]] + +def mostLeastScannedDevices(devices): + device_counts = devices["bt_address"].value_counts() + return ("","") if (len(device_counts) == 0) else (device_counts.idxmax(), device_counts.idxmin()) + +def validate_requested_features(provider): + base_features = {"DEVICES": set(["countscans", "uniquedevices", "meanscans", "stdscans"]), + "SCANS_MOST_FREQUENT_DEVICE": set(["withinsegments", "acrosssegments", "acrossdataset"]), + "SCANS_LEAST_FREQUENT_DEVICE": set(["withinsegments", "acrosssegments", "acrossdataset"])} + + # Check we have three arrays of features + ownership_keys = [x.lower() for x in provider["FEATURES"].keys()] + if set(ownership_keys) != set(["own", "others", "all"]): + raise ValueError("[PHONE_BLUETOOTH][DORYAB][FEATURES] config key must have three types called ALL, OWN and OTHERS, instead you provided {}".format(ownership_keys)) + # Check each array contains valid features + for ownership_key in provider["FEATURES"].keys(): + for type_key in provider["FEATURES"][ownership_key]: + if 
len(provider["FEATURES"][ownership_key][type_key]) > 0 and not set(provider["FEATURES"][ownership_key][type_key]) <= base_features[type_key]: + raise ValueError("[PHONE_BLUETOOTH][DORYAB][FEATURES][{}][{}] config key only supports features called [{}], instead you provided [{}]".format(ownership_key, type_key, ",".join(base_features[type_key]), ",".join(set(provider["FEATURES"][ownership_key][type_key]) - base_features[type_key]))) def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs): bt_data = pd.read_csv(sensor_data_files["sensor_data"]) - base_features = set(["countscans", "uniquedevices", "countscansmostuniquedevice", "countscansleastuniquedevice", "meanscans", "stdscans"]) - ownership_keys = [x.lower() for x in provider["FEATURES"].keys()] - if set(ownership_keys) != set(["own", "others", "all"]): - raise ValueError("[PHONE_BLUETOOTH][DORYAB][FEATURES] config key can only have three lists called ALL, OWN and OTHERS, instead you provided {}".format(ownership_keys)) - + feature_prefix = {"DEVICES":"", "SCANS_MOST_FREQUENT_DEVICE":"countscansmostfrequentdevice", "SCANS_LEAST_FREQUENT_DEVICE":"countscansleastfrequentdevice"} + validate_requested_features(provider) + device_ownership = ownership_based_on_clustering(deviceFrequency(bt_data)).set_index("bt_address") bt_data = bt_data.set_index("bt_address").join(device_ownership, how="left").reset_index() bt_data["own_device"].fillna(0, inplace=True) - segment_bt_data = filter_data_by_segment(bt_data, time_segment) + dataset_most_common_device, dataset_least_common_device = mostLeastScannedDevices(bt_data) + segment_bt_data = filter_data_by_segment(bt_data.head(0), time_segment) features = pd.DataFrame(columns=['local_segment']).set_index("local_segment") for ownership in provider["FEATURES"].keys(): - features_to_compute = list(set(provider["FEATURES"][ownership]) & base_features) + + features_to_compute = [] + for type_key in provider["FEATURES"][ownership]: + 
features_to_compute = features_to_compute + [feature_prefix[type_key] + feature for feature in provider["FEATURES"][ownership][type_key]] + if ownership == "OWN": owner_segment_bt_data = segment_bt_data.query("own_device == 1") elif ownership == "OTHERS": owner_segment_bt_data = segment_bt_data.query("own_device == 0") else: #ALL owner_segment_bt_data = segment_bt_data - features = deviceFeatures(owner_segment_bt_data, ownership.lower(), features_to_compute, features) - + + segment_most_common_device, segment_least_common_device = mostLeastScannedDevices(owner_segment_bt_data) + common_devices = {"most_dataset": dataset_most_common_device, "least_dataset": dataset_least_common_device, + "most_segments": segment_most_common_device, "least_segments": segment_least_common_device} + + features = deviceFeatures(owner_segment_bt_data, ownership.lower(), common_devices, features_to_compute, features) features = features.reset_index() + + # Impute all NaN except for std dev + for column in features: + if column not in ["stdscansall", "stdscansown", "stdscansothers"]: + features[column].fillna(0.0, inplace=True) return features From 05627296f4aa54baaebad011632274e3cbdbc187 Mon Sep 17 00:00:00 2001 From: JulioV Date: Sat, 12 Dec 2020 17:10:59 -0500 Subject: [PATCH 3/5] Fix filter_data_by_segment bug --- src/features/utils/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/features/utils/utils.py b/src/features/utils/utils.py index 6938f875..d250c889 100644 --- a/src/features/utils/utils.py +++ b/src/features/utils/utils.py @@ -1,13 +1,17 @@ rapids_log_tag = "RAPIDS:" def filter_data_by_segment(data, time_segment): + if(data.shape[0] == 0): # data is empty + data["local_segment"] = data["timestamps_segment"] = None + return data + datetime_regex = "[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}" timestamps_regex = "[0-9]{13}" segment_regex = "\[({}#{},{};{},{})\]".format(time_segment, datetime_regex, datetime_regex, 
timestamps_regex, timestamps_regex) data["local_segment"] = data["assigned_segments"].str.extract(segment_regex, expand=True) data = data.drop(columns=["assigned_segments"]) data = data.dropna(subset = ["local_segment"]) - if(data.shape[0] == 0): # there are no rows belonging to time_segment + if(data.shape[0] == 0): # there are no rows belonging to time_segment after dropping na data["timestamps_segment"] = None else: data[["local_segment","timestamps_segment"]] = data["local_segment"].str.split(pat =";",n=1, expand=True) From 63202c62cca8107ca1f09bb9da0fcf936feb5700 Mon Sep 17 00:00:00 2001 From: JulioV Date: Sat, 12 Dec 2020 17:11:48 -0500 Subject: [PATCH 4/5] Fix BT doryab bug and update change log --- docs/change-log.md | 5 ++++- src/features/phone_bluetooth/doryab/main.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/change-log.md b/docs/change-log.md index 749be5ef..ebff7391 100644 --- a/docs/change-log.md +++ b/docs/change-log.md @@ -1,6 +1,9 @@ # Change Log -## Release in progress +## v0.2.0 +- Add new `PHONE_BLUETOOTH` `DORYAB` provider +- Deprecate `PHONE_BLUETOOTH` `RAPIDS` provider +- Fix bug in `filter_data_by_segment` for Python when dataset was empty - Minor doc updates - New FAQ item diff --git a/src/features/phone_bluetooth/doryab/main.py b/src/features/phone_bluetooth/doryab/main.py index 76b6eb94..e571eb09 100644 --- a/src/features/phone_bluetooth/doryab/main.py +++ b/src/features/phone_bluetooth/doryab/main.py @@ -123,7 +123,7 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se bt_data = bt_data.set_index("bt_address").join(device_ownership, how="left").reset_index() bt_data["own_device"].fillna(0, inplace=True) dataset_most_common_device, dataset_least_common_device = mostLeastScannedDevices(bt_data) - segment_bt_data = filter_data_by_segment(bt_data.head(0), time_segment) + segment_bt_data = filter_data_by_segment(bt_data, time_segment) features =
pd.DataFrame(columns=['local_segment']).set_index("local_segment") for ownership in provider["FEATURES"].keys(): From 1359acf5fdfedb985355c040a82f6a6936717981 Mon Sep 17 00:00:00 2001 From: JulioV Date: Sat, 12 Dec 2020 17:13:04 -0500 Subject: [PATCH 5/5] Update git flow --- docs/developers/git-flow.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/developers/git-flow.md b/docs/developers/git-flow.md index 0e1ef808..130524b1 100644 --- a/docs/developers/git-flow.md +++ b/docs/developers/git-flow.md @@ -29,6 +29,7 @@ git commit -m "Add my new feature" # use a concise description ```bash git checkout feature/feature1 + git pull origin develop git rebase -i develop git checkout develop git merge --no-ff feature/feature1 # (use the default merge message)