Merge branch 'develop' of https://github.com/carissalow/rapids into develop

pull/108/head
abhineethreddyk 2020-12-14 22:16:12 -05:00
commit 6234675a36
12 changed files with 402 additions and 8 deletions

View File

@ -7,6 +7,6 @@
**R**eproducible **A**nalysis **Pi**peline for **D**ata **S**treams **R**eproducible **A**nalysis **Pi**peline for **D**ata **S**treams
For more information refer to our [documentation](https://www.rapids.science) For more information refer to our [documentation](http://www.rapids.science)
By [MoSHI](https://www.moshi.pitt.edu/), [University of Pittsburgh](https://www.pitt.edu/) By [MoSHI](https://www.moshi.pitt.edu/), [University of Pittsburgh](https://www.pitt.edu/)

View File

@ -100,6 +100,15 @@ for provider in config["PHONE_LIGHT"]["PROVIDERS"].keys():
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
# Request every phone keyboard output for each provider whose COMPUTE flag is set.
for provider, provider_settings in config["PHONE_KEYBOARD"]["PROVIDERS"].items():
    if not provider_settings["COMPUTE"]:
        continue
    files_to_compute.extend(expand("data/raw/{pid}/phone_keyboard_raw.csv", pid=config["PIDS"]))
    files_to_compute.extend(expand("data/raw/{pid}/phone_keyboard_with_datetime.csv", pid=config["PIDS"]))
    # The interim file name carries the provider's language and its lower-cased key
    files_to_compute.extend(
        expand(
            "data/interim/{pid}/phone_keyboard_features/phone_keyboard_{language}_{provider_key}.csv",
            pid=config["PIDS"],
            language=provider_settings["SRC_LANGUAGE"].lower(),
            provider_key=provider.lower(),
        )
    )
    files_to_compute.extend(expand("data/processed/features/{pid}/phone_keyboard.csv", pid=config["PIDS"]))
    files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
    files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
for provider in config["PHONE_ACCELEROMETER"]["PROVIDERS"].keys(): for provider in config["PHONE_ACCELEROMETER"]["PROVIDERS"].keys():
if config["PHONE_ACCELEROMETER"]["PROVIDERS"][provider]["COMPUTE"]: if config["PHONE_ACCELEROMETER"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_accelerometer_raw.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/raw/{pid}/phone_accelerometer_raw.csv", pid=config["PIDS"]))

View File

@ -127,6 +127,23 @@ PHONE_BLUETOOTH:
FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"] FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
SRC_FOLDER: "rapids" # inside src/features/phone_bluetooth SRC_FOLDER: "rapids" # inside src/features/phone_bluetooth
SRC_LANGUAGE: "r" SRC_LANGUAGE: "r"
DORYAB:
COMPUTE: FALSE
FEATURES:
ALL:
DEVICES: ["countscans", "uniquedevices", "meanscans", "stdscans"]
SCANS_MOST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"]
SCANS_LEAST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"]
OWN:
DEVICES: ["countscans", "uniquedevices", "meanscans", "stdscans"]
SCANS_MOST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"]
SCANS_LEAST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"]
OTHERS:
DEVICES: ["countscans", "uniquedevices", "meanscans", "stdscans"]
SCANS_MOST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"]
SCANS_LEAST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"]
SRC_FOLDER: "doryab" # inside src/features/phone_bluetooth
SRC_LANGUAGE: "python"
# See https://www.rapids.science/latest/features/phone-calls/ # See https://www.rapids.science/latest/features/phone-calls/
PHONE_CALLS: PHONE_CALLS:
@ -172,6 +189,15 @@ PHONE_DATA_YIELD:
SRC_LANGUAGE: "r" SRC_LANGUAGE: "r"
SRC_FOLDER: "rapids" # inside src/features/phone_data_yield SRC_FOLDER: "rapids" # inside src/features/phone_data_yield
PHONE_KEYBOARD:
TABLE: keyboard
PROVIDERS:
RAPIDS:
COMPUTE: True
FEATURES: []
SRC_FOLDER: "rapids" # inside src/features/phone_keyboard
SRC_LANGUAGE: "python"
# See https://www.rapids.science/latest/features/phone-light/ # See https://www.rapids.science/latest/features/phone-light/
PHONE_LIGHT: PHONE_LIGHT:
TABLE: light TABLE: light

View File

@ -1,6 +1,14 @@
# Change Log # Change Log
## Release in progress ## v0.2.2
- Fix readme link to docs
## v0.2.1
- Fix link to the most recent version in the old version banner
## v0.2.0
- Add new `PHONE_BLUETOOTH` `DORYAB` provider
- Deprecate `PHONE_BLUETOOTH` `RAPIDS` provider
- Fix bug in `filter_data_by_segment` for Python when dataset was empty
- Minor doc updates - Minor doc updates
- New FAQ item - New FAQ item

View File

@ -28,6 +28,13 @@ If you computed applications foreground features using the app category (genre)
!!! cite "Stachl et al. citation" !!! cite "Stachl et al. citation"
Clemens Stachl, Quay Au, Ramona Schoedel, Samuel D. Gosling, Gabriella M. Harari, Daniel Buschek, Sarah Theres Völkel, Tobias Schuwerk, Michelle Oldemeier, Theresa Ullmann, Heinrich Hussmann, Bernd Bischl, Markus Bühner. Proceedings of the National Academy of Sciences Jul 2020, 117 (30) 17680-17687; DOI: 10.1073/pnas.1920484117 Clemens Stachl, Quay Au, Ramona Schoedel, Samuel D. Gosling, Gabriella M. Harari, Daniel Buschek, Sarah Theres Völkel, Tobias Schuwerk, Michelle Oldemeier, Theresa Ullmann, Heinrich Hussmann, Bernd Bischl, Markus Bühner. Proceedings of the National Academy of Sciences Jul 2020, 117 (30) 17680-17687; DOI: 10.1073/pnas.1920484117
## Doryab (bluetooth)
If you computed bluetooth features using the provider `[PHONE_BLUETOOTH][DORYAB]` cite [this paper](https://arxiv.org/abs/1812.10394) in addition to RAPIDS.
!!! cite "Doryab et al. citation"
Doryab, A., Chikarsel, P., Liu, X., & Dey, A. K. (2019). Extraction of Behavioral Features from Smartphone and Wearable Data. ArXiv:1812.10394 [Cs, Stat]. http://arxiv.org/abs/1812.10394
## Barnett (locations) ## Barnett (locations)
If you computed locations features using the provider `[PHONE_LOCATIONS][BARNETT]` cite [this paper](https://doi.org/10.1093/biostatistics/kxy059) and [this paper](https://doi.org/10.1145/2750858.2805845) in addition to RAPIDS. If you computed locations features using the provider `[PHONE_LOCATIONS][BARNETT]` cite [this paper](https://doi.org/10.1093/biostatistics/kxy059) and [this paper](https://doi.org/10.1145/2750858.2805845) in addition to RAPIDS.

View File

@ -15,6 +15,7 @@ git pull
git checkout -b feature/feature1 git checkout -b feature/feature1
``` ```
1. Add, modify or delete the necessary files to add your new feature 1. Add, modify or delete the necessary files to add your new feature
1. Update the [change log](../../change-log) (`docs/change-log.md`)
2. Stage and commit your changes using VS Code git GUI or the following commands 2. Stage and commit your changes using VS Code git GUI or the following commands
```bash ```bash
git add modified-file1 modified-file2 git add modified-file1 modified-file2
@ -29,6 +30,7 @@ git commit -m "Add my new feature" # use a concise description
```bash ```bash
git checkout feature/feature1 git checkout feature/feature1
git pull origin develop
git rebase -i develop git rebase -i develop
git checkout develop git checkout develop
git merge --no-ff feature/feature1 # (use the default merge message) git merge --no-ff feature/feature1 # (use the default merge message)
@ -88,6 +90,7 @@ git describe --abbrev=0 --tags # Bump the hotfix (0.1.0 to 0.1.1 => NEW_HOTFIX)
git checkout -b hotfix/v[NEW_HOTFIX] master git checkout -b hotfix/v[NEW_HOTFIX] master
``` ```
1. Fix whatever needs to be fixed 1. Fix whatever needs to be fixed
1. Update the change log
1. Tag and merge the hotfix 1. Tag and merge the hotfix
```bash ```bash
git tag v[NEW_HOTFIX] git tag v[NEW_HOTFIX]

View File

@ -8,6 +8,9 @@ Sensor parameters description for `[PHONE_BLUETOOTH]`:
## RAPIDS provider ## RAPIDS provider
!!! warning
The features of this provider are deprecated in favor of `DORYAB` provider (see below).
!!! info "Available time segments and platforms" !!! info "Available time segments and platforms"
- Available for all time segments - Available for all time segments
- Available for Android only - Available for Android only
@ -33,9 +36,125 @@ Features description for `[PHONE_BLUETOOTH][PROVIDERS][RAPIDS]`:
|Feature |Units |Description| |Feature |Units |Description|
|-------------------------- |---------- |---------------------------| |-------------------------- |---------- |---------------------------|
| countscans | devices | Number of scanned devices during a `time_segment`, a device can be detected multiple times over time and these appearances are counted separately | | {--countscans--} | devices | Number of scanned devices during a time segment, a device can be detected multiple times over time and these appearances are counted separately |
| uniquedevices | devices | Number of unique devices during a `time_segment` as identified by their hardware (`bt_address`) address | | {--uniquedevices--} | devices | Number of unique devices during a time segment as identified by their hardware (`bt_address`) address |
| countscansmostuniquedevice | scans | Number of scans of the most scanned device during a `time_segment` across the whole monitoring period | | {--countscansmostuniquedevice--} | scans | Number of scans of the most sensed device within each time segment instance |
!!! note "Assumptions/Observations" !!! note "Assumptions/Observations"
NA - From `v0.2.0` `countscans`, `uniquedevices`, `countscansmostuniquedevice` were deprecated because they overlap with the respective features for `ALL` devices of the `PHONE_BLUETOOTH` `DORYAB` provider
## DORYAB provider
This provider is adapted from the work by [Doryab et al](../../citation#doryab-bluetooth).
!!! info "Available time segments and platforms"
- Available for all time segments
- Available for Android only
!!! info "File Sequence"
```bash
- data/raw/{pid}/phone_bluetooth_raw.csv
- data/raw/{pid}/phone_bluetooth_with_datetime.csv
- data/interim/{pid}/phone_bluetooth_features/phone_bluetooth_{language}_{provider_key}.csv
- data/processed/features/{pid}/phone_bluetooth.csv
```
Parameters description for `[PHONE_BLUETOOTH][PROVIDERS][DORYAB]`:
|Key                              | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[COMPUTE]`| Set to `True` to extract `PHONE_BLUETOOTH` features from the `DORYAB` provider|
|`[FEATURES]` | Features to be computed, see table below. These features are computed for three device categories: `all` devices, `own` devices and `other` devices.
Features description for `[PHONE_BLUETOOTH][PROVIDERS][DORYAB]`:
|Feature                                                                                   |Units |Description|
|-------------------------- |---------- |---------------------------|
| countscans | scans | Number of scans (rows) from the devices sensed during a time segment instance. The more scans a bluetooth device has the longer it remained within range of the participant's phone |
| uniquedevices | devices | Number of unique bluetooth devices sensed during a time segment instance as identified by their hardware addresses (`bt_address`) |
| meanscans | scans| Mean of the scans of every sensed device within each time segment instance|
| stdscans | scans| Standard deviation of the scans of every sensed device within each time segment instance|
| countscans{==most==}frequentdevice{==within==}segments | scans | Number of scans of the **most** sensed device **within** each time segment instance|
| countscans{==least==}frequentdevice{==within==}segments| scans| Number of scans of the **least** sensed device **within** each time segment instance |
| countscans{==most==}frequentdevice{==across==}segments | scans | Number of scans of the **most** sensed device **across** time segment instances of the same type|
| countscans{==least==}frequentdevice{==across==}segments| scans| Number of scans of the **least** sensed device **across** time segment instances of the same type per device|
| countscans{==most==}frequentdevice{==acrossdataset==} | scans | Number of scans of the **most** sensed device **across** the entire dataset of every participant|
| countscans{==least==}frequentdevice{==acrossdataset==}| scans| Number of scans of the **least** sensed device **across** the entire dataset of every participant |
!!! note "Assumptions/Observations"
- Devices are classified as belonging to the participant (`own`) or to other people (`others`) using k-means based on the number of times and the number of days each device was detected across each participant's dataset. See [Doryab et al](../../citation#doryab-bluetooth) for more details.
- If ownership cannot be computed because all devices were detected on only one day, they are all considered as `other`. Thus `all` and `other` features will be equal. The likelihood of this scenario decreases the more days of data you have.
- The most and least frequent devices will be the same across time segment instances and across the entire dataset when every time segment instance covers every hour of a dataset. For example, daily segments (00:00 to 23:59) fall in this category but morning segments (06:00am to 11:59am) or periodic 30-minute segments don't.
??? info "Example"
??? example "Simplified raw bluetooth data"
The following is a simplified example with bluetooth data from three days and two time segments: morning and afternoon. There are two `own` devices: `55C836F5-487E-405F-8E28-21DBD40FA4FF` detected seven times across two days and `499A1EAF-DDF1-4657-986C-EA5032104448` detected eight times on a single day.
```csv
local_date segment bt_address own_device
2016-11-29 morning 55C836F5-487E-405F-8E28-21DBD40FA4FF 1
2016-11-29 morning 55C836F5-487E-405F-8E28-21DBD40FA4FF 1
2016-11-29 morning 55C836F5-487E-405F-8E28-21DBD40FA4FF 1
2016-11-29 morning 55C836F5-487E-405F-8E28-21DBD40FA4FF 1
2016-11-29 morning 48872A52-68DE-420D-98DA-73339A1C4685 0
2016-11-29 afternoon 55C836F5-487E-405F-8E28-21DBD40FA4FF 1
2016-11-29 afternoon 48872A52-68DE-420D-98DA-73339A1C4685 0
2016-11-30 morning 55C836F5-487E-405F-8E28-21DBD40FA4FF 1
2016-11-30 morning 48872A52-68DE-420D-98DA-73339A1C4685 0
2016-11-30 morning 25262DC7-780C-4AD5-AD3A-D9776AEF7FC1 0
2016-11-30 morning 5B1E6981-2E50-4D9A-99D8-67AED430C5A8 0
2016-11-30 morning 5B1E6981-2E50-4D9A-99D8-67AED430C5A8 0
2016-11-30 afternoon 55C836F5-487E-405F-8E28-21DBD40FA4FF 1
2017-05-07 morning 5C5A9C41-2F68-4CEB-96D0-77DE3729B729 0
2017-05-07 morning 25262DC7-780C-4AD5-AD3A-D9776AEF7FC1 0
2017-05-07 morning 5B1E6981-2E50-4D9A-99D8-67AED430C5A8 0
2017-05-07 morning 6C444841-FE64-4375-BC3F-FA410CDC0AC7 0
2017-05-07 morning 4DC7A22D-9F1F-4DEF-8576-086910AABCB5 0
2017-05-07 afternoon 5B1E6981-2E50-4D9A-99D8-67AED430C5A8 0
2017-05-07 afternoon 499A1EAF-DDF1-4657-986C-EA5032104448 1
2017-05-07 afternoon 499A1EAF-DDF1-4657-986C-EA5032104448 1
2017-05-07 afternoon 499A1EAF-DDF1-4657-986C-EA5032104448 1
2017-05-07 afternoon 499A1EAF-DDF1-4657-986C-EA5032104448 1
2017-05-07 afternoon 499A1EAF-DDF1-4657-986C-EA5032104448 1
2017-05-07 afternoon 499A1EAF-DDF1-4657-986C-EA5032104448 1
2017-05-07 afternoon 499A1EAF-DDF1-4657-986C-EA5032104448 1
2017-05-07 afternoon 499A1EAF-DDF1-4657-986C-EA5032104448 1
```
??? example "The most and least frequent `OTHER` devices (`own_device == 0`) during morning segments"
The most and least frequent `ALL`|`OWN`|`OTHER` devices are computed within each time segment instance, across time segment instances of the same type and across the entire dataset of each person. These are the most and least frequent devices for `OTHER` devices during morning segments.
```csv
most frequent device across 2016-11-29 morning: '48872A52-68DE-420D-98DA-73339A1C4685' (this device is the only one in this instance)
least frequent device across 2016-11-29 morning: '48872A52-68DE-420D-98DA-73339A1C4685' (this device is the only one in this instance)
most frequent device across 2016-11-30 morning: '5B1E6981-2E50-4D9A-99D8-67AED430C5A8'
least frequent device across 2016-11-30 morning: '25262DC7-780C-4AD5-AD3A-D9776AEF7FC1' (when tied, the first occurrence is chosen)
most frequent device across 2017-05-07 morning: '25262DC7-780C-4AD5-AD3A-D9776AEF7FC1' (when tied, the first occurrence is chosen)
least frequent device across 2017-05-07 morning: '25262DC7-780C-4AD5-AD3A-D9776AEF7FC1' (when tied, the first occurrence is chosen)
most frequent across morning segments: '5B1E6981-2E50-4D9A-99D8-67AED430C5A8'
least frequent across morning segments: '6C444841-FE64-4375-BC3F-FA410CDC0AC7' (when tied, the first occurrence is chosen)
most frequent across dataset: '499A1EAF-DDF1-4657-986C-EA5032104448' (only taking into account "morning" segments)
least frequent across dataset: '4DC7A22D-9F1F-4DEF-8576-086910AABCB5' (when tied, the first occurrence is chosen)
```
??? example "Bluetooth features for `OTHER` devices and morning segments"
For brevity we only show the following features for morning segments:
```yaml
OTHER:
DEVICES: ["countscans", "uniquedevices", "meanscans", "stdscans"]
SCANS_MOST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"]
```
Note that `countscansmostfrequentdeviceacrossdatasetothers` is all `0`s because `499A1EAF-DDF1-4657-986C-EA5032104448` is excluded from the count, as it is labelled as an `own` device (not `other`).
```csv
local_segment countscansothers uniquedevicesothers meanscansothers stdscansothers countscansmostfrequentdevicewithinsegmentsothers countscansmostfrequentdeviceacrosssegmentsothers countscansmostfrequentdeviceacrossdatasetothers
2016-11-29-morning 1 1 1.000000 NaN 1 0.0 0.0
2016-11-30-morning 4 3 1.333333 0.57735 2 2.0 2.0
2017-05-07-morning 5 5 1.000000 0.00000 1 1.0 1.0
```

View File

@ -8,7 +8,7 @@ window.addEventListener("DOMContentLoaded", function() {
if(versions[id]["aliases"].length > 0 && versions[id]["aliases"].includes("latest")) if(versions[id]["aliases"].length > 0 && versions[id]["aliases"].includes("latest"))
latest_version = "/" + versions[id].version + "/" latest_version = "/" + versions[id].version + "/"
if(!window.location.pathname.includes("/latest/") && (latest_version.length > 0 && !window.location.pathname.includes(latest_version))) if(!window.location.pathname.includes("/latest/") && (latest_version.length > 0 && !window.location.pathname.includes(latest_version)))
document.querySelector("div[data-md-component=announce]").innerHTML = "<div id='announce-msg'>You are seeing the docs for a previous version of RAPIDS, <a href='" + window.location.href + "latest/'>click here to go to the latest</a></div>" document.querySelector("div[data-md-component=announce]").innerHTML = "<div id='announce-msg'>You are seeing the docs for a previous version of RAPIDS, <a href='http://www.rapids.science/'>click here to go to the latest</a></div>"
}; };
xhr.send(); xhr.send();
}); });

View File

@ -262,6 +262,32 @@ rule phone_light_r_features:
script: script:
"../src/features/entry.R" "../src/features/entry.R"
# Runs the Python feature entry script on one participant's keyboard data for
# the provider named by the {provider_key} wildcard.
rule phone_keyboard_python_features:
    input:
        sensor_data = "data/raw/{pid}/phone_keyboard_with_datetime.csv",
        time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
    params:
        # Provider settings are looked up with the upper-cased wildcard value
        provider = lambda wildcards: config["PHONE_KEYBOARD"]["PROVIDERS"][wildcards.provider_key.upper()],
        provider_key = "{provider_key}",
        sensor_key = "phone_keyboard"
    output:
        "data/interim/{pid}/phone_keyboard_features/phone_keyboard_python_{provider_key}.csv"
    script:
        "../src/features/entry.py"
# Runs the R feature entry script on one participant's keyboard data for the
# provider named by the {provider_key} wildcard.
rule phone_keyboard_r_features:
    input:
        sensor_data = "data/raw/{pid}/phone_keyboard_with_datetime.csv",
        time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
    params:
        # Provider settings are looked up with the upper-cased wildcard value
        provider = lambda wildcards: config["PHONE_KEYBOARD"]["PROVIDERS"][wildcards.provider_key.upper()],
        provider_key = "{provider_key}",
        sensor_key = "phone_keyboard"
    output:
        "data/interim/{pid}/phone_keyboard_features/phone_keyboard_r_{provider_key}.csv"
    script:
        "../src/features/entry.R"
rule phone_locations_python_features: rule phone_locations_python_features:
input: input:
sensor_data = "data/interim/{pid}/phone_locations_processed_with_datetime.csv", sensor_data = "data/interim/{pid}/phone_locations_processed_with_datetime.csv",

View File

@ -0,0 +1,152 @@
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
def deviceFeatures(devices, ownership, common_devices, features_to_compute, features):
    """Outer-join the requested scan features of one ownership group onto ``features``.

    ``devices`` is read through its ``local_segment`` and ``bt_address`` columns
    (one row per scan). Each computed column is named ``<feature><ownership>``
    and is joined on ``local_segment``. ``common_devices`` supplies, under the
    keys ``most_segments``, ``least_segments``, ``most_dataset`` and
    ``least_dataset``, the address whose scans the "across" features report.
    Returns the widened ``features`` frame.
    """
    if devices.shape[0] == 0:
        scans_per_device = pd.DataFrame(columns=["local_segment", "bt_address", "scans"], dtype=int)
    else:
        # One row per (segment, device) with the number of times it was scanned
        per_segment_counts = devices.groupby(["local_segment"])["bt_address"].value_counts()
        scans_per_device = per_segment_counts.to_frame("scans").reset_index()

    # (feature, aggregated column, aggregation, key in common_devices or None).
    # The order of this table is the order in which columns are joined.
    feature_specs = [
        ("countscans", "scans", "sum", None),
        ("uniquedevices", "bt_address", "nunique", None),
        ("meanscans", "scans", "mean", None),
        ("stdscans", "scans", "std", None),
        # Most frequent device within segments, across segments, and across dataset
        ("countscansmostfrequentdevicewithinsegments", "scans", "max", None),
        ("countscansmostfrequentdeviceacrosssegments", "scans", "max", "most_segments"),
        ("countscansmostfrequentdeviceacrossdataset", "scans", "max", "most_dataset"),
        # Least frequent device within segments, across segments, and across dataset
        ("countscansleastfrequentdevicewithinsegments", "scans", "min", None),
        ("countscansleastfrequentdeviceacrosssegments", "scans", "min", "least_segments"),
        ("countscansleastfrequentdeviceacrossdataset", "scans", "min", "least_dataset"),
    ]

    for feature, column, aggregation, device_key in feature_specs:
        if feature not in features_to_compute:
            continue
        counts = scans_per_device
        if device_key is not None:
            # Only the scans of the chosen common device contribute
            common_device = common_devices[device_key]
            counts = counts.query("bt_address in @common_device")
        per_segment = getattr(counts.groupby("local_segment")[column], aggregation)()
        features = features.join(per_segment.to_frame(feature + ownership), how="outer")

    return features
def deviceFrequency(bt_data):
    """Summarise how often each bluetooth address was scanned.

    Rows without a ``bt_address`` are ignored. The result is indexed by
    ``bt_address`` and has the columns ``days_scanned`` (distinct
    ``local_date`` values), ``scans`` (number of rows) and ``scans_per_day``
    (``scans`` divided by ``days_scanned``).
    """
    scans = bt_data[["local_date", "bt_address"]].dropna(subset=["bt_address"])
    per_device = scans.groupby("bt_address").agg({"local_date": pd.Series.nunique, "bt_address": 'count'})
    per_device = per_device.rename(columns={"local_date": "days_scanned", "bt_address": "scans"})
    per_device["scans_per_day"] = per_device["scans"] / per_device["days_scanned"]
    return per_device
def ownership_based_on_clustering(bt_frequency):
    """Label every bluetooth device as owned by the participant (1) or not (0).

    ``bt_frequency`` must provide ``bt_address`` (as index or column) and the
    columns ``scans_per_day``, ``days_scanned`` and ``scans``. Each column is
    z-scored, devices are scored as ``scans_per_day_z + days_scanned_z`` and the
    scores are clustered with k-means for k=2 and, when at least three devices
    are available, k=3. The clustering with the smaller sum of squared
    distances to its centers is kept (k=3 on ties) and the devices in the
    cluster with the highest center are marked as owned.

    Returns a DataFrame with the columns ``bt_address`` and ``own_device``.
    When fewer than two devices have valid z-scores no clustering is possible
    and ``own_device`` is ``None`` for whatever rows remain.
    """
    bt_frequency = bt_frequency.reset_index()
    for col in ["scans_per_day", "days_scanned", "scans"]:
        col_zscore = col + "_z"
        bt_frequency[col_zscore] = (bt_frequency[col] - bt_frequency[col].mean()) / bt_frequency[col].std(ddof=0)

    # A column with zero variance produces 0/0 = NaN z-scores, which removes
    # every row here. copy() so the new column is not written onto a view.
    bt_frequency = bt_frequency.dropna(how="any").copy()
    if len(bt_frequency) < 2:
        bt_frequency["own_device"] = None
        return bt_frequency[["bt_address", "own_device"]]

    score = bt_frequency["scans_per_day_z"] + bt_frequency["days_scanned_z"]
    maxscore = np.max(score)
    minscore = np.min(score)
    midscore = (maxscore + minscore) / 2

    X_array = score.values
    X = np.reshape(X_array, (len(score), 1))

    def _cluster(initial_centers):
        """Fit k-means from the given centers; return labels, centers and squared error."""
        kmeans = KMeans(n_clusters=len(initial_centers), init=initial_centers, n_init=1).fit(X)
        labels = kmeans.labels_
        centers = [c[0] for c in kmeans.cluster_centers_]
        sum_dist = sum((x - centers[label]) ** 2 for x, label in zip(X_array, labels))
        return labels, centers, sum_dist

    # NOTE(review): the int32 dtype truncates the fractional part of the
    # initial centers; kept as is because changing it would change the
    # resulting clusters. Confirm whether this is intended.
    # K = 2, devices I own VS devices other people own
    labels, centers, sum_dist = _cluster(np.array([[maxscore], [minscore]], np.int32))

    # K = 3, devices I own VS devices my partner/roommate owns (can also be
    # other devices I own though) VS devices other people own.
    # KMeans needs at least as many samples as clusters, so K = 3 is only
    # attempted with three or more devices.
    if len(X_array) >= 3:
        labels_k3, centers_k3, sum_dist_k3 = _cluster(np.array([[maxscore], [midscore], [minscore]], np.int32))
        if not sum_dist < sum_dist_k3:  # K = 3 is at least as good
            labels, centers = labels_k3, centers_k3

    bt_frequency["own_device"] = np.where(labels == np.argmax(centers), 1, 0)
    return bt_frequency[["bt_address", "own_device"]]
def mostLeastScannedDevices(devices):
    """Return the (most, least) frequently scanned ``bt_address`` in ``devices``.

    A pair of empty strings is returned when there are no addresses to count.
    """
    scans_by_address = devices["bt_address"].value_counts()
    if len(scans_by_address) == 0:
        return ("", "")
    return (scans_by_address.idxmax(), scans_by_address.idxmin())
def validate_requested_features(provider):
    """Check the ``FEATURES`` section of the ``[PHONE_BLUETOOTH][DORYAB]`` provider.

    ``provider["FEATURES"]`` must map exactly the three ownership groups ALL,
    OWN and OTHERS (compared case-insensitively) to dictionaries whose keys are
    feature types (``DEVICES``, ``SCANS_MOST_FREQUENT_DEVICE``,
    ``SCANS_LEAST_FREQUENT_DEVICE``) and whose values list the requested
    features of that type. Returns ``None`` when the configuration is valid.

    Raises:
        ValueError: if the ownership groups are not ALL/OWN/OTHERS, if a feature
            type is unknown, or if a requested feature is not supported by its
            feature type.
    """
    base_features = {"DEVICES": {"countscans", "uniquedevices", "meanscans", "stdscans"},
                     "SCANS_MOST_FREQUENT_DEVICE": {"withinsegments", "acrosssegments", "acrossdataset"},
                     "SCANS_LEAST_FREQUENT_DEVICE": {"withinsegments", "acrosssegments", "acrossdataset"}}

    # Check we have three arrays of features
    ownership_keys = [x.lower() for x in provider["FEATURES"].keys()]
    if set(ownership_keys) != {"own", "others", "all"}:
        raise ValueError("[PHONE_BLUETOOTH][DORYAB][FEATURES] config key must have three types called ALL, OWN and OTHERS, instead you provided {}".format(ownership_keys))

    # Check each array contains valid features
    for ownership_key, feature_types in provider["FEATURES"].items():
        for type_key in feature_types:
            # An unknown feature type is a config error, report it as such
            # instead of failing with a bare KeyError on the lookup below
            if type_key not in base_features:
                raise ValueError("[PHONE_BLUETOOTH][DORYAB][FEATURES][{}] config key only supports feature types called [{}], instead you provided [{}]".format(ownership_key, ",".join(sorted(base_features)), type_key))
            unsupported = set(feature_types[type_key]) - base_features[type_key]
            if unsupported:
                # sorted() keeps the message stable between runs
                raise ValueError("[PHONE_BLUETOOTH][DORYAB][FEATURES][{}][{}] config key only supports features called [{}], instead you provided [{}]".format(ownership_key, type_key, ",".join(sorted(base_features[type_key])), ",".join(sorted(unsupported))))
def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    """Compute the DORYAB bluetooth features for one time segment.

    Args:
        sensor_data_files: mapping whose ``"sensor_data"`` entry is the path of
            the bluetooth CSV file to read.
        time_segment: passed through to ``filter_data_by_segment``.
        provider: provider settings; ``provider["FEATURES"]`` maps the ownership
            groups ALL, OWN and OTHERS to the feature types and features to
            compute (validated with ``validate_requested_features``).
        filter_data_by_segment: callable ``(data, time_segment)`` returning the
            rows of ``data`` assigned to ``time_segment``.

    Returns:
        A DataFrame with a ``local_segment`` column and one column per requested
        feature and ownership group. Missing values are replaced by 0 except in
        the standard deviation columns.

    Raises:
        ValueError: if ``provider["FEATURES"]`` is not a valid configuration.
    """
    bt_data = pd.read_csv(sensor_data_files["sensor_data"])
    feature_prefix = {"DEVICES": "", "SCANS_MOST_FREQUENT_DEVICE": "countscansmostfrequentdevice", "SCANS_LEAST_FREQUENT_DEVICE": "countscansleastfrequentdevice"}
    validate_requested_features(provider)

    device_ownership = ownership_based_on_clustering(deviceFrequency(bt_data)).set_index("bt_address")
    bt_data = bt_data.set_index("bt_address").join(device_ownership, how="left").reset_index()
    # Devices without an ownership label count as not owned. The column is
    # reassigned because an in-place fillna on a selected column does not
    # update the frame when pandas Copy-on-Write is active.
    bt_data["own_device"] = bt_data["own_device"].fillna(0)
    dataset_most_common_device, dataset_least_common_device = mostLeastScannedDevices(bt_data)
    segment_bt_data = filter_data_by_segment(bt_data, time_segment)

    features = pd.DataFrame(columns=['local_segment']).set_index("local_segment")
    for ownership, feature_types in provider["FEATURES"].items():
        features_to_compute = []
        for type_key, requested in feature_types.items():
            features_to_compute.extend(feature_prefix[type_key] + feature for feature in requested)

        # The validator accepts ownership keys in any case, so compare them
        # case-insensitively here as well; otherwise a lower-case "own" or
        # "others" group would silently be computed over all devices.
        ownership_group = ownership.upper()
        if ownership_group == "OWN":
            owner_segment_bt_data = segment_bt_data.query("own_device == 1")
        elif ownership_group == "OTHERS":
            owner_segment_bt_data = segment_bt_data.query("own_device == 0")
        else:  # ALL
            owner_segment_bt_data = segment_bt_data

        segment_most_common_device, segment_least_common_device = mostLeastScannedDevices(owner_segment_bt_data)
        common_devices = {"most_dataset": dataset_most_common_device, "least_dataset": dataset_least_common_device,
                          "most_segments": segment_most_common_device, "least_segments": segment_least_common_device}

        features = deviceFeatures(owner_segment_bt_data, ownership.lower(), common_devices, features_to_compute, features)
    features = features.reset_index()

    # Impute all NaN except for std dev
    for column in features.columns:
        if column not in ["stdscansall", "stdscansown", "stdscansothers"]:
            features[column] = features[column].fillna(0.0)
    return features

View File

@ -0,0 +1,40 @@
import pandas as pd
import numpy as np
def rapids_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    """Placeholder for the RAPIDS phone keyboard provider; nothing is computed yet.

    The previous body was a debugging stub: it printed the whole raw keyboard
    dataset, raised ``ValueError("Test")`` unconditionally and was followed by
    unreachable code copied from the light provider (it read the light-only
    column ``double_light_lux``). The data dump and the dead code are removed
    and the error now explains what is going on.

    TODO: implement the keyboard features.

    Raises:
        ValueError: always, because this provider has no implementation yet.
    """
    raise ValueError("[PHONE_KEYBOARD][PROVIDERS][RAPIDS] is not implemented yet, no keyboard features can be computed. Set its [COMPUTE] config key to False.")

View File

@ -1,13 +1,17 @@
rapids_log_tag = "RAPIDS:" rapids_log_tag = "RAPIDS:"
def filter_data_by_segment(data, time_segment): def filter_data_by_segment(data, time_segment):
if(data.shape[0] == 0): # data is empty
data["local_segment"] = data["timestamps_segment"] = None
return data
datetime_regex = "[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}" datetime_regex = "[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
timestamps_regex = "[0-9]{13}" timestamps_regex = "[0-9]{13}"
segment_regex = "\[({}#{},{};{},{})\]".format(time_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex) segment_regex = "\[({}#{},{};{},{})\]".format(time_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex)
data["local_segment"] = data["assigned_segments"].str.extract(segment_regex, expand=True) data["local_segment"] = data["assigned_segments"].str.extract(segment_regex, expand=True)
data = data.drop(columns=["assigned_segments"]) data = data.drop(columns=["assigned_segments"])
data = data.dropna(subset = ["local_segment"]) data = data.dropna(subset = ["local_segment"])
if(data.shape[0] == 0): # there are no rows belonging to time_segment if(data.shape[0] == 0): # there are no rows belonging to time_segment after droping na
data["timestamps_segment"] = None data["timestamps_segment"] = None
else: else:
data[["local_segment","timestamps_segment"]] = data["local_segment"].str.split(pat =";",n=1, expand=True) data[["local_segment","timestamps_segment"]] = data["local_segment"].str.split(pat =";",n=1, expand=True)