diff --git a/Snakefile b/Snakefile index 04882a79..413438de 100644 --- a/Snakefile +++ b/Snakefile @@ -14,6 +14,15 @@ if len(config["PIDS"]) == 0: for provider in config["PHONE_DATA_YIELD"]["PROVIDERS"].keys(): if config["PHONE_DATA_YIELD"]["PROVIDERS"][provider]["COMPUTE"]: + + allowed_phone_sensors = get_phone_sensor_names() + if not (set(config["PHONE_DATA_YIELD"]["SENSORS"]) <= set(allowed_phone_sensors)): + raise ValueError('\nInvalid sensor(s) for PHONE_DATA_YIELD. config["PHONE_DATA_YIELD"]["SENSORS"] can have ' + 'one or more of the following phone sensors: {}.\nInstead you provided "{}".\n' + 'Keep in mind that the sensors\' TABLE attribute must point to a valid database table'\ + .format(', '.join(allowed_phone_sensors), + ', '.join(set(config["PHONE_DATA_YIELD"]["SENSORS"]) - set(allowed_phone_sensors)))) + files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=map(str.lower, config["PHONE_DATA_YIELD"]["SENSORS"]))) files_to_compute.extend(expand("data/interim/{pid}/phone_yielded_timestamps.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_yielded_timestamps_with_datetime.csv", pid=config["PIDS"])) @@ -147,6 +156,49 @@ for provider in config["PHONE_CONVERSATION"]["PROVIDERS"].keys(): files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") +# We can delete these if's as soon as we add feature PROVIDERS to any of these sensors +if isinstance(config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"], dict): + for provider in config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"].keys(): + if config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_applications_crashes_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_applications_crashes_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_applications_crashes_with_datetime_with_categories.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_applications_crashes_features/phone_applications_crashes_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_applications_crashes.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + +if isinstance(config["PHONE_APPLICATIONS_NOTIFICATIONS"]["PROVIDERS"], dict): + for provider in config["PHONE_APPLICATIONS_NOTIFICATIONS"]["PROVIDERS"].keys(): + if config["PHONE_APPLICATIONS_NOTIFICATIONS"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_applications_notifications_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_applications_notifications_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_applications_notifications_with_datetime_with_categories.csv", pid=config["PIDS"])) + 
files_to_compute.extend(expand("data/interim/{pid}/phone_applications_notifications_features/phone_applications_notifications_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_APPLICATIONS_NOTIFICATIONS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_applications_notifications.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + +if isinstance(config["PHONE_KEYBOARD"]["PROVIDERS"], dict): + for provider in config["PHONE_KEYBOARD"]["PROVIDERS"].keys(): + if config["PHONE_KEYBOARD"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_keyboard_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_keyboard_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_keyboard_features/phone_keyboard_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_KEYBOARD"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_keyboard.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + +if isinstance(config["PHONE_AWARE_LOG"]["PROVIDERS"], dict): + for provider in config["PHONE_AWARE_LOG"]["PROVIDERS"].keys(): + if config["PHONE_AWARE_LOG"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_aware_log_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_aware_log_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_aware_log_features/phone_aware_log_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_AWARE_LOG"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_aware_log.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys(): if config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["COMPUTE"]: if config["PHONE_LOCATIONS"]["LOCATIONS_TO_USE"] == "FUSED_RESAMPLED": diff --git a/config.yaml b/config.yaml index ac871093..96aa087a 100644 --- a/config.yaml +++ b/config.yaml @@ -85,6 +85,16 @@ PHONE_ACTIVITY_RECOGNITION: SRC_FOLDER: "rapids" # inside src/features/phone_activity_recognition SRC_LANGUAGE: "python" +# See https://www.rapids.science/latest/features/phone-applications-crashes/ +PHONE_APPLICATIONS_CRASHES: + TABLE: applications_crashes + APPLICATION_CATEGORIES: + CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scraped from the Play Store) + CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv" + UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be 
saved to CATALOGUE_FILE + SCRAPE_MISSING_CATEGORIES: False # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway + PROVIDERS: # None implemented yet but this sensor can be used in PHONE_DATA_YIELD + # See https://www.rapids.science/latest/features/phone-applications-foreground/ PHONE_APPLICATIONS_FOREGROUND: TABLE: applications_foreground @@ -107,6 +117,21 @@ PHONE_APPLICATIONS_FOREGROUND: SRC_FOLDER: "rapids" # inside src/features/phone_applications_foreground SRC_LANGUAGE: "python" +# See https://www.rapids.science/latest/features/phone-applications-notifications/ +PHONE_APPLICATIONS_NOTIFICATIONS: + TABLE: applications_notifications + APPLICATION_CATEGORIES: + CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scraped from the Play Store) + CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv" + UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE + SCRAPE_MISSING_CATEGORIES: False # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway + PROVIDERS: # None implemented yet but this sensor can be used in PHONE_DATA_YIELD + +# See https://www.rapids.science/latest/features/phone-aware-log/ +PHONE_AWARE_LOG: + TABLE: aware_log + PROVIDERS: # None implemented yet but this sensor can be used in PHONE_DATA_YIELD + # See https://www.rapids.science/latest/features/phone-battery/ PHONE_BATTERY: TABLE: battery @@ -189,6 +214,11 @@ PHONE_DATA_YIELD: SRC_LANGUAGE: "r" SRC_FOLDER: "rapids" # inside src/features/phone_data_yield +# See https://www.rapids.science/latest/features/phone-keyboard/ +PHONE_KEYBOARD: + TABLE: keyboard + PROVIDERS: # None implemented yet but this sensor can be used in PHONE_DATA_YIELD + # See https://www.rapids.science/latest/features/phone-light/ PHONE_LIGHT: TABLE: light @@ -202,20 +232,21 @@ PHONE_LIGHT: # See https://www.rapids.science/latest/features/phone-locations/ PHONE_LOCATIONS: TABLE: locations - LOCATIONS_TO_USE: FUSED_RESAMPLED # ALL, GPS OR FUSED_RESAMPLED + LOCATIONS_TO_USE: ALL_RESAMPLED # ALL, GPS, ALL_RESAMPLED, OR FUSED_RESAMPLED FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row PROVIDERS: DORYAB: COMPUTE: False FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy"] - DBSCAN_EPS: 100 # meters + ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. 
This number means there's a 68% probability the true location is within this radius + DBSCAN_EPS: 10 # meters DBSCAN_MINSAMPLES: 5 THRESHOLD_STATIC : 1 # km/h MAXIMUM_GAP_ALLOWED: 300 MINUTES_DATA_USED: False SAMPLING_FREQUENCY: 0 - CLUSTER_ON: TIME_SEGMENT # PARTICIPANT_DATASET,TIME_SEGMENT + CLUSTER_ON: PARTICIPANT_DATASET # PARTICIPANT_DATASET,TIME_SEGMENT CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS SRC_FOLDER: "doryab" # inside src/features/phone_locations SRC_LANGUAGE: "python" diff --git a/docs/change-log.md b/docs/change-log.md index 7b05e06d..077c261b 100644 --- a/docs/change-log.md +++ b/docs/change-log.md @@ -1,5 +1,8 @@ # Change Log +## next version v0.4.0 +- Add four new phone sensors that can be used for PHONE_DATA_YIELD +- Add code so new feature providers can be added for the four new sensors ## v0.3.2 - Update docker and linux instructions to use RSPM binary repo for for faster installation - Update CI to create a release on a tagged push that passes the tests diff --git a/docs/features/phone-applications-crashes.md b/docs/features/phone-applications-crashes.md new file mode 100644 index 00000000..dcf4cd38 --- /dev/null +++ b/docs/features/phone-applications-crashes.md @@ -0,0 +1,14 @@ +# Phone Applications Crashes + +Sensor parameters description for `[PHONE_APPLICATIONS_CRASHES]`: + +|Key                              | Description | +|----------------|----------------------------------------------------------------------------------------------------------------------------------- +|`[TABLE]`| Database table where the applications crashes data is stored +|`[APPLICATION_CATEGORIES][CATALOGUE_SOURCE]` | `FILE` or `GOOGLE`. If `FILE`, app categories (genres) are read from `[CATALOGUE_FILE]`. If `GOOGLE`, app categories (genres) are scraped from the Play Store +|`[APPLICATION_CATEGORIES][CATALOGUE_FILE]` | CSV file with a `package_name` and `genre` column. By default we provide the catalogue created by [Stachl et al](../../citation#stachl-applications-crashes) in `data/external/stachl_application_genre_catalogue.csv` +|`[APPLICATION_CATEGORIES][UPDATE_CATALOGUE_FILE]` | if `[CATALOGUE_SOURCE]` is equal to `FILE`, this flag signals whether or not to update `[CATALOGUE_FILE]`, if `[CATALOGUE_SOURCE]` is equal to `GOOGLE` all scraped genres will be saved to `[CATALOGUE_FILE]` +|`[APPLICATION_CATEGORIES][SCRAPE_MISSING_CATEGORIES]` | This flag signals whether or not to scrape categories (genres) missing from the `[CATALOGUE_FILE]`. If `[CATALOGUE_SOURCE]` is equal to `GOOGLE`, all genres are scraped anyway (this flag is ignored) + +!!! note + No feature providers have been implemented for this sensor yet; however, you can use its key (`PHONE_APPLICATIONS_CRASHES`) to improve [`PHONE_DATA_YIELD`](../phone-data-yield) or you can [implement your own features](../add-new-features). 
\ No newline at end of file diff --git a/docs/features/phone-applications-notifications.md b/docs/features/phone-applications-notifications.md new file mode 100644 index 00000000..b32afde7 --- /dev/null +++ b/docs/features/phone-applications-notifications.md @@ -0,0 +1,14 @@ +# Phone Applications Notifications + +Sensor parameters description for `[PHONE_APPLICATIONS_NOTIFICATIONS]`: + +|Key                              | Description | +|----------------|----------------------------------------------------------------------------------------------------------------------------------- +|`[TABLE]`| Database table where the applications notifications data is stored +|`[APPLICATION_CATEGORIES][CATALOGUE_SOURCE]` | `FILE` or `GOOGLE`. If `FILE`, app categories (genres) are read from `[CATALOGUE_FILE]`. If `GOOGLE`, app categories (genres) are scraped from the Play Store +|`[APPLICATION_CATEGORIES][CATALOGUE_FILE]` | CSV file with a `package_name` and `genre` column. By default we provide the catalogue created by [Stachl et al](../../citation#stachl-applications-notifications) in `data/external/stachl_application_genre_catalogue.csv` +|`[APPLICATION_CATEGORIES][UPDATE_CATALOGUE_FILE]` | if `[CATALOGUE_SOURCE]` is equal to `FILE`, this flag signals whether or not to update `[CATALOGUE_FILE]`, if `[CATALOGUE_SOURCE]` is equal to `GOOGLE` all scraped genres will be saved to `[CATALOGUE_FILE]` +|`[APPLICATION_CATEGORIES][SCRAPE_MISSING_CATEGORIES]` | This flag signals whether or not to scrape categories (genres) missing from the `[CATALOGUE_FILE]`. If `[CATALOGUE_SOURCE]` is equal to `GOOGLE`, all genres are scraped anyway (this flag is ignored) + +!!! note + No feature providers have been implemented for this sensor yet; however, you can use its key (`PHONE_APPLICATIONS_NOTIFICATIONS`) to improve [`PHONE_DATA_YIELD`](../phone-data-yield) or you can [implement your own features](../add-new-features). \ No newline at end of file diff --git a/docs/features/phone-aware-log.md b/docs/features/phone-aware-log.md new file mode 100644 index 00000000..cbeb9d37 --- /dev/null +++ b/docs/features/phone-aware-log.md @@ -0,0 +1,10 @@ +# Phone Aware Log + +Sensor parameters description for `[PHONE_AWARE_LOG]`: + +|Key                              | Description | +|----------------|----------------------------------------------------------------------------------------------------------------------------------- +|`[TABLE]`| Database table where the aware log data is stored + +!!! note + No feature providers have been implemented for this sensor yet; however, you can use its key (`PHONE_AWARE_LOG`) to improve [`PHONE_DATA_YIELD`](../phone-data-yield) or you can [implement your own features](../add-new-features). \ No newline at end of file diff --git a/docs/features/phone-data-yield.md b/docs/features/phone-data-yield.md index 5327a131..4f6e5185 100644 --- a/docs/features/phone-data-yield.md +++ b/docs/features/phone-data-yield.md @@ -9,18 +9,22 @@ Sensor parameters description for `[PHONE_DATA_YIELD]`: |Key                    | Description | |----------------|----------------------------------------------------------------------------------------------------------------------------------- -|`[SENSORS]`| One or more phone sensor config keys (e.g. `PHONE_MESSAGE`). The more keys you include the more accurately RAPIDS can approximate the time an smartphone was sensing data. The supported phone sensors you can include in this list are outlined below (**do NOT include Fitbit sensors**). 
+|`[SENSORS]`| One or more phone sensor config keys (e.g. `PHONE_MESSAGE`). The more keys you include the more accurately RAPIDS can approximate the time a smartphone was sensing data. The supported phone sensors you can include in this list are outlined below (**do NOT include Fitbit sensors, ONLY include phone sensors**). !!! info "Supported phone sensors for `[PHONE_DATA_YIELD][SENSORS]`" ```yaml PHONE_ACCELEROMETER PHONE_ACTIVITY_RECOGNITION + PHONE_APPLICATIONS_CRASHES PHONE_APPLICATIONS_FOREGROUND + PHONE_APPLICATIONS_NOTIFICATIONS + PHONE_AWARE_LOG PHONE_BATTERY PHONE_BLUETOOTH PHONE_CALLS PHONE_CONVERSATION PHONE_MESSAGES + PHONE_KEYBOARD PHONE_LIGHT PHONE_LOCATIONS PHONE_SCREEN diff --git a/docs/features/phone-keyboard.md b/docs/features/phone-keyboard.md new file mode 100644 index 00000000..4aba2c62 --- /dev/null +++ b/docs/features/phone-keyboard.md @@ -0,0 +1,10 @@ +# Phone Keyboard + +Sensor parameters description for `[PHONE_KEYBOARD]`: + +|Key                              | Description | +|----------------|----------------------------------------------------------------------------------------------------------------------------------- +|`[TABLE]`| Database table where the keyboard data is stored + +!!! note + No feature providers have been implemented for this sensor yet; however, you can use its key (`PHONE_KEYBOARD`) to improve [`PHONE_DATA_YIELD`](../phone-data-yield) or you can [implement your own features](../add-new-features). \ No newline at end of file diff --git a/docs/features/phone-locations.md b/docs/features/phone-locations.md index 1529525e..0471e83a 100644 --- a/docs/features/phone-locations.md +++ b/docs/features/phone-locations.md @@ -5,13 +5,13 @@ Sensor parameters description for `[PHONE_LOCATIONS]`: |Key                                                                                        | Description | |----------------|----------------------------------------------------------------------------------------------------------------------------------- |`[TABLE]`| Database table where the location data is stored -|`[LOCATIONS_TO_USE]`| Type of location data to use, one of `ALL`, `GPS` or `FUSED_RESAMPLED`. This filter is based on the `provider` column of the AWARE locations table, `ALL` includes every row, `GPS` only includes rows where provider is gps, and `FUSED_RESAMPLED` only includes rows where provider is fused after being resampled. -|`[FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD]`| if `FUSED_RESAMPLED` is used, the original fused data has to be resampled, a location row will be resampled to the next valid timestamp (see the Assumptions/Observations below) only if the time difference between them is less or equal than this threshold (in minutes). -|`[FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION]`| if `FUSED_RESAMPLED` is used, the original fused data has to be resampled, a location row will be resampled at most for this long (in minutes) +|`[LOCATIONS_TO_USE]`| Type of location data to use, one of `ALL`, `GPS`, `ALL_RESAMPLED` or `FUSED_RESAMPLED`. This filter is based on the `provider` column of the AWARE locations table, `ALL` includes every row, `GPS` only includes rows where provider is gps, `ALL_RESAMPLED` includes all rows after being resampled, and `FUSED_RESAMPLED` only includes rows where provider is fused after being resampled. 
+|`[FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD]`| if `ALL_RESAMPLED` or `FUSED_RESAMPLED` is used, the original location data has to be resampled, a location row will be resampled to the next valid timestamp (see the Assumptions/Observations below) only if the time difference between them is less than or equal to this threshold (in minutes). +|`[FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION]`| if `ALL_RESAMPLED` or `FUSED_RESAMPLED` is used, the original location data has to be resampled, a location row will be resampled at most for this long (in minutes) !!! note "Assumptions/Observations" **Types of location data to use** - AWARE Android and iOS clients can collect location coordinates through the phone\'s GPS, the network cellular towers around the phone or Google\'s fused location API. If you want to use only the GPS provider set `[LOCATIONS_TO_USE]` to `GPS`, if you want to use all providers (not recommended due to the difference in accuracy) set `[LOCATIONS_TO_USE]` to `ALL`, if your AWARE client was configured to use fused location only or want to focus only on this provider, set `[LOCATIONS_TO_USE]` to `RESAMPLE_FUSED`. `RESAMPLE_FUSED` takes the original fused location coordinates and replicates each pair forward in time as long as the phone was sensing data as indicated by the joined timestamps of [`[PHONE_DATA_YIELD][SENSORS]`](../phone-data-yield/), this is done because Google\'s API only logs a new location coordinate pair when it is sufficiently different in time or space from the previous one. + AWARE Android and iOS clients can collect location coordinates through the phone\'s GPS, the network cellular towers around the phone, or Google\'s fused location API. If you want to use only the GPS provider set `[LOCATIONS_TO_USE]` to `GPS`, if you want to use all providers set `[LOCATIONS_TO_USE]` to `ALL`, if you collected location data from different providers including the fused API use `ALL_RESAMPLED`, if your AWARE client was configured to use fused location only or you want to focus only on this provider, set `[LOCATIONS_TO_USE]` to `FUSED_RESAMPLED`. `ALL_RESAMPLED` and `FUSED_RESAMPLED` take the original location coordinates and replicate each pair forward in time as long as the phone was sensing data as indicated by the joined timestamps of [`[PHONE_DATA_YIELD][SENSORS]`](../phone-data-yield/), this is done because Google\'s API only logs a new location coordinate pair when it is sufficiently different in time or space from the previous one and because GPS and network providers can log data at variable rates. There are two parameters associated with resampling fused location. `FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD` (in minutes, default 30) controls the maximum gap between any two coordinate pairs to replicate the last known pair (for example, participant A\'s phone did not collect data between 10:30am and 10:50am and between 11:05am and 11:40am, the last known coordinate pair will be replicated during the first period but not the second, in other words, we assume that we can no longer guarantee the participant stayed at the last known location if the phone did not sense data for more than 30 minutes). `FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION` (in minutes, default 720 or 12 hours) stops the last known fused location from being replicated longer than this threshold even if the phone was sensing data continuously (for example, participant A went home at 9pm and their phone was sensing data without gaps until 11am the next morning, the last known location will only be replicated until 9am). 
If you have suggestions to modify or improve this resampling, let us know. @@ -100,6 +100,7 @@ Parameters description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`: |----------------|----------------------------------------------------------------------------------------------------------------------------------- |`[COMPUTE]`| Set to `True` to extract `PHONE_LOCATIONS` features from the `BARNETT` provider| |`[FEATURES]` | Features to be computed, see table below +|`[ACCURACY_LIMIT]` | An integer in meters, any location rows with an accuracy higher than this will be dropped. This number means there's a 68% probability the true location is within this radius | `[DBSCAN_EPS]` | The maximum distance in meters between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function. | `[DBSCAN_MINSAMPLES]` | The number of samples (or total weight) in a neighborhood for a point to be considered as a core point of a cluster. This includes the point itself. | `[THRESHOLD_STATIC]` | It is the threshold value in km/hr which labels a row as Static or Moving. diff --git a/mkdocs.yml b/mkdocs.yml index a11cd595..c6f03c63 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -82,12 +82,16 @@ nav: - Phone: - Phone Accelerometer: features/phone-accelerometer.md - Phone Activity Recognition: features/phone-activity-recognition.md + - Phone Applications Crashes: features/phone-applications-crashes.md - Phone Applications Foreground: features/phone-applications-foreground.md + - Phone Applications Notifications: features/phone-applications-notifications.md + - Phone Aware Log: features/phone-aware-log.md - Phone Battery: features/phone-battery.md - Phone Bluetooth: features/phone-bluetooth.md - Phone Calls: features/phone-calls.md - Phone Conversation: features/phone-conversation.md - Phone Data Yield: features/phone-data-yield.md + - Phone Keyboard: features/phone-keyboard.md - Phone Light: features/phone-light.md - Phone Locations: features/phone-locations.md - Phone Messages: features/phone-messages.md diff --git a/rules/common.smk b/rules/common.smk index ef9af1ca..a496be7f 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -15,10 +15,18 @@ def optional_steps_sleep_input(wildcards): def input_merge_sensor_features_for_individual_participants(wildcards): feature_files = [] for config_key in config.keys(): - if config_key.startswith(("PHONE", "FITBIT")) and "PROVIDERS" in config[config_key]: + if config_key.startswith(("PHONE", "FITBIT")) and "PROVIDERS" in config[config_key] and isinstance(config[config_key]["PROVIDERS"], dict): for provider_key, provider in config[config_key]["PROVIDERS"].items(): if "COMPUTE" in provider.keys() and provider["COMPUTE"]: feature_files.append("data/processed/features/{pid}/" + config_key.lower() + ".csv") break return feature_files +def get_phone_sensor_names(): + phone_sensor_names = [] + for config_key in config.keys(): + if config_key.startswith(("PHONE")) and "PROVIDERS" in config[config_key]: + if config_key != "PHONE_DATA_YIELD" and config_key not in phone_sensor_names: + phone_sensor_names.append(config_key) + return phone_sensor_names + diff --git a/rules/features.smk b/rules/features.smk index c106912f..d66fbf19 100644 --- a/rules/features.smk +++ b/rules/features.smk @@ -96,6 +96,32 @@ rule phone_activity_recognition_r_features: script: "../src/features/entry.R" +rule 
phone_applications_crashes_python_features: + input: + sensor_data = "data/raw/{pid}/phone_applications_crashes_with_datetime_with_categories.csv", + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" + params: + provider = lambda wildcards: config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"][wildcards.provider_key.upper()], + provider_key = "{provider_key}", + sensor_key = "phone_applications_crashes" + output: + "data/interim/{pid}/phone_applications_crashes_features/phone_applications_crashes_python_{provider_key}.csv" + script: + "../src/features/entry.py" + +rule phone_applications_crashes_r_features: + input: + sensor_data = "data/raw/{pid}/phone_applications_crashes_with_datetime_with_categories.csv", + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" + params: + provider = lambda wildcards: config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"][wildcards.provider_key.upper()], + provider_key = "{provider_key}", + sensor_key = "phone_applications_crashes" + output: + "data/interim/{pid}/phone_applications_crashes_features/phone_applications_crashes_r_{provider_key}.csv" + script: + "../src/features/entry.R" + rule phone_applications_foreground_python_features: input: sensor_data = "data/raw/{pid}/phone_applications_foreground_with_datetime_with_categories.csv", @@ -122,6 +148,58 @@ rule phone_applications_foreground_r_features: script: "../src/features/entry.R" +rule phone_applications_notifications_python_features: + input: + sensor_data = "data/raw/{pid}/phone_applications_notifications_with_datetime_with_categories.csv", + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" + params: + provider = lambda wildcards: config["PHONE_APPLICATIONS_NOTIFICATIONS"]["PROVIDERS"][wildcards.provider_key.upper()], + provider_key = "{provider_key}", + sensor_key = "phone_applications_notifications" + output: + "data/interim/{pid}/phone_applications_notifications_features/phone_applications_notifications_python_{provider_key}.csv" + script: + "../src/features/entry.py" + +rule phone_applications_notifications_r_features: + input: + sensor_data = "data/raw/{pid}/phone_applications_notifications_with_datetime_with_categories.csv", + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" + params: + provider = lambda wildcards: config["PHONE_APPLICATIONS_NOTIFICATIONS"]["PROVIDERS"][wildcards.provider_key.upper()], + provider_key = "{provider_key}", + sensor_key = "phone_applications_notifications" + output: + "data/interim/{pid}/phone_applications_notifications_features/phone_applications_notifications_r_{provider_key}.csv" + script: + "../src/features/entry.R" + +rule phone_aware_log_python_features: + input: + sensor_data = "data/raw/{pid}/phone_aware_log_with_datetime.csv", + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" + params: + provider = lambda wildcards: config["PHONE_AWARE_LOG"]["PROVIDERS"][wildcards.provider_key.upper()], + provider_key = "{provider_key}", + sensor_key = "phone_aware_log" + output: + "data/interim/{pid}/phone_aware_log_features/phone_aware_log_python_{provider_key}.csv" + script: + "../src/features/entry.py" + +rule phone_aware_log_r_features: + input: + sensor_data = "data/raw/{pid}/phone_aware_log_with_datetime.csv", + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" + params: + provider = lambda wildcards: config["PHONE_AWARE_LOG"]["PROVIDERS"][wildcards.provider_key.upper()], + 
provider_key = "{provider_key}", + sensor_key = "phone_aware_log" + output: + "data/interim/{pid}/phone_aware_log_features/phone_aware_log_r_{provider_key}.csv" + script: + "../src/features/entry.R" + rule battery_episodes: input: "data/raw/{pid}/phone_battery_raw.csv" @@ -236,6 +314,32 @@ rule conversation_r_features: script: "../src/features/entry.R" +rule phone_keyboard_python_features: + input: + sensor_data = "data/raw/{pid}/phone_keyboard_with_datetime.csv", + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" + params: + provider = lambda wildcards: config["PHONE_KEYBOARD"]["PROVIDERS"][wildcards.provider_key.upper()], + provider_key = "{provider_key}", + sensor_key = "phone_keyboard" + output: + "data/interim/{pid}/phone_keyboard_features/phone_keyboard_python_{provider_key}.csv" + script: + "../src/features/entry.py" + +rule phone_keyboard_r_features: + input: + sensor_data = "data/raw/{pid}/phone_keyboard_with_datetime.csv", + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" + params: + provider = lambda wildcards: config["PHONE_KEYBOARD"]["PROVIDERS"][wildcards.provider_key.upper()], + provider_key = "{provider_key}", + sensor_key = "phone_keyboard" + output: + "data/interim/{pid}/phone_keyboard_features/phone_keyboard_r_{provider_key}.csv" + script: + "../src/features/entry.R" + rule phone_light_python_features: input: sensor_data = "data/raw/{pid}/phone_light_with_datetime.csv", diff --git a/rules/preprocessing.smk b/rules/preprocessing.smk index 8f754885..e660d13a 100644 --- a/rules/preprocessing.smk +++ b/rules/preprocessing.smk @@ -163,14 +163,14 @@ rule resample_episodes_with_datetime: rule phone_application_categories: input: - "data/raw/{pid}/phone_applications_foreground_with_datetime.csv" + "data/raw/{pid}/phone_applications_{type}_with_datetime.csv" params: - catalogue_source = config["PHONE_APPLICATIONS_FOREGROUND"]["APPLICATION_CATEGORIES"]["CATALOGUE_SOURCE"], - catalogue_file = config["PHONE_APPLICATIONS_FOREGROUND"]["APPLICATION_CATEGORIES"]["CATALOGUE_FILE"], - update_catalogue_file = config["PHONE_APPLICATIONS_FOREGROUND"]["APPLICATION_CATEGORIES"]["UPDATE_CATALOGUE_FILE"], - scrape_missing_genres = config["PHONE_APPLICATIONS_FOREGROUND"]["APPLICATION_CATEGORIES"]["SCRAPE_MISSING_CATEGORIES"] + catalogue_source = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["CATALOGUE_SOURCE"], + catalogue_file = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["CATALOGUE_FILE"], + update_catalogue_file = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["UPDATE_CATALOGUE_FILE"], + scrape_missing_genres = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["SCRAPE_MISSING_CATEGORIES"] output: - "data/raw/{pid}/phone_applications_foreground_with_datetime_with_categories.csv" + "data/raw/{pid}/phone_applications_{type}_with_datetime_with_categories.csv" script: "../src/data/application_categories.R" diff --git a/src/data/process_location_types.R b/src/data/process_location_types.R index 141ae417..ce77d9f2 100644 --- a/src/data/process_location_types.R +++ b/src/data/process_location_types.R @@ -12,21 +12,34 @@ locations <- read.csv(snakemake@input[["locations"]]) %>% filter(double_latitude != 0 & double_longitude != 0) %>% drop_na(double_longitude, double_latitude) -if(!locations_to_use %in% 
c("ALL", "FUSED_RESAMPLED", "GPS")){ - print("Unkown location filter, provide one of the following three: ALL, GPS, or FUSED_RESAMPLED") +if(!locations_to_use %in% c("ALL", "FUSED_RESAMPLED", "GPS", "ALL_RESAMPLED")){ + print("Unkown location filter, provide one of the following three: ALL, GPS, ALL_RESAMPLED, or FUSED_RESAMPLED") quit(save = "no", status = 1, runLast = FALSE) } +# keep the location row that has the best (lowest) accuracy if more than 1 row was logged within any 1 second +if(locations_to_use %in% c("FUSED_RESAMPLED", "ALL_RESAMPLED")) + locations <- locations %>% drop_na(double_longitude, double_latitude) %>% + mutate(minute_bin = timestamp %/% 1001) %>% + group_by(minute_bin) %>% + slice(which.min(accuracy)) %>% + ungroup() %>% + select(-minute_bin) if(locations_to_use == "ALL"){ processed_locations <- locations } else if(locations_to_use == "GPS"){ processed_locations <- locations %>% filter(provider == "gps") -} else if(locations_to_use == "FUSED_RESAMPLED"){ - locations <- locations %>% filter(provider == "fused") +} else if(locations_to_use %in% c("FUSED_RESAMPLED", "ALL_RESAMPLED")){ + if (locations_to_use == "FUSED_RESAMPLED"){ + locations <- locations %>% filter(provider == "fused") + providers_to_keep = c("fused") + } else if(locations_to_use == "ALL_RESAMPLED"){ + providers_to_keep = c("fused", "gps", "network") + } + if(nrow(locations) > 0){ processed_locations <- locations %>% - # TODO filter repeated location rows based on the accurcy distinct(timestamp, .keep_all = TRUE) %>% bind_rows(phone_sensed_timestamps) %>% arrange(timestamp) %>% @@ -37,7 +50,7 @@ if(locations_to_use == "ALL"){ group_by(resample_group) %>% # Filter those rows that are further away than time_since_valid_location since the last fused location mutate(time_from_fused = timestamp - first(timestamp)) %>% - filter(provider == "fused" | (time_from_fused < (1000 * 60 * time_since_valid_location))) %>% + filter(provider %in% providers_to_keep | (time_from_fused < (1000 * 60 * time_since_valid_location))) %>% # Summarise the period to resample for summarise(limit = max(timestamp), timestamp = first(timestamp), double_latitude = first(double_latitude), double_longitude = first(double_longitude), double_bearing=first(double_bearing), double_speed = first(double_speed), double_altitude=first(double_altitude), provider=first(provider), diff --git a/src/features/phone_locations/barnett/main.R b/src/features/phone_locations/barnett/main.R index b305be20..26d54224 100644 --- a/src/features/phone_locations/barnett/main.R +++ b/src/features/phone_locations/barnett/main.R @@ -65,9 +65,9 @@ barnett_features <- function(sensor_data_files, time_segment, params){ # Some minutes have multiple fused rows location_minutes_used <- location %>% group_by(local_date, local_hour) %>% - summarise(n_minutes = n_distinct(local_minute)) %>% + summarise(n_minutes = n_distinct(local_minute), .groups = 'drop_last') %>% group_by(local_date) %>% - summarise(minutes_data_used = sum(n_minutes)) %>% + summarise(minutes_data_used = sum(n_minutes), .groups = 'drop_last') %>% select(local_date, minutes_data_used) # Save time segment to attach it later @@ -78,7 +78,7 @@ barnett_features <- function(sensor_data_files, time_segment, params){ if(nrow(location %>% filter(accuracy < accuracy_limit)) > 1){ outputMobility <- MobilityFeatures(location, ACCURACY_LIM = accuracy_limit, tz = timezone) } else { - print(paste("Cannot compute location features because there are no rows with an accuracy value lower than ACCURACY_LIMIT", 
accuracy_limit)) + print(paste("Cannot compute Barnett location features because there are no rows with an accuracy value lower than ACCURACY_LIMIT", accuracy_limit)) outputMobility <- NULL } diff --git a/src/features/phone_locations/doryab/main.py b/src/features/phone_locations/doryab/main.py index 42c63c99..d85ce4e2 100644 --- a/src/features/phone_locations/doryab/main.py +++ b/src/features/phone_locations/doryab/main.py @@ -1,5 +1,6 @@ import pandas as pd import numpy as np +import warnings from astropy.timeseries import LombScargle from sklearn.cluster import DBSCAN,OPTICS from math import radians, cos, sin, asin, sqrt @@ -8,6 +9,7 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se location_data = pd.read_csv(sensor_data_files["sensor_data"]) requested_features = provider["FEATURES"] + accuracy_limit = provider["ACCURACY_LIMIT"] dbscan_eps = provider["DBSCAN_EPS"] dbscan_minsamples = provider["DBSCAN_MINSAMPLES"] threshold_static = provider["THRESHOLD_STATIC"] @@ -31,6 +33,11 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se hyperparameters = {'max_eps': distance_to_degrees(dbscan_eps), 'min_samples': 2, 'metric':'euclidean', 'cluster_method' : 'dbscan'} else: raise ValueError("config[PHONE_LOCATIONS][DORYAB][CLUSTERING ALGORITHM] only accepts DBSCAN or OPTICS but you provided ",clustering_algorithm) + + rows_before_accuracy_filter = len(location_data) + location_data.query("accuracy < @accuracy_limit", inplace=True) + if rows_before_accuracy_filter > 0 and len(location_data) == 0: + warnings.warn("Cannot compute Doryab location features because there are no rows with an accuracy value lower than ACCURACY_LIMIT: {}".format(accuracy_limit)) if location_data.empty: location_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
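A note on the `PHONE_DATA_YIELD` validation added at the top of the Snakefile: it rejects any entry of `config["PHONE_DATA_YIELD"]["SENSORS"]` that is not one of the phone sensor sections returned by `get_phone_sensor_names()`. The following standalone Python sketch reproduces that subset check; the `config` dictionary and sensor keys are hypothetical stand-ins, not the real parsed `config.yaml`.

```python
# Standalone sketch of the PHONE_DATA_YIELD sensor validation (hypothetical config).
config = {
    "PIDS": ["p01"],
    "PHONE_DATA_YIELD": {"SENSORS": ["PHONE_KEYBOARD", "PHONE_AWARE_LOG"]},
    "PHONE_KEYBOARD": {"PROVIDERS": None},
    "PHONE_AWARE_LOG": {"PROVIDERS": None},
}

def get_phone_sensor_names(config):
    # Mirrors rules/common.smk: every PHONE_* section with a PROVIDERS attribute
    # counts as a phone sensor, except PHONE_DATA_YIELD itself.
    return [key for key, section in config.items()
            if key.startswith("PHONE") and key != "PHONE_DATA_YIELD"
            and isinstance(section, dict) and "PROVIDERS" in section]

allowed = set(get_phone_sensor_names(config))
requested = set(config["PHONE_DATA_YIELD"]["SENSORS"])
if not requested <= allowed:  # the same subset test used in the Snakefile
    raise ValueError("Invalid sensor(s) for PHONE_DATA_YIELD: {}. Allowed: {}"
                     .format(", ".join(sorted(requested - allowed)), ", ".join(sorted(allowed))))
print("PHONE_DATA_YIELD sensors are valid:", sorted(requested))
```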
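The `phone_application_categories` rule is now parameterized by a `{type}` wildcard, so one rule serves the foreground, crashes, and notifications tables by looking up `"PHONE_APPLICATIONS_" + type.upper()` in the config. A minimal sketch of that lookup, assuming a plain dictionary in place of Snakemake's `config` and `wildcards` objects:

```python
# Sketch of the wildcard-driven config lookup used by phone_application_categories.
# `config` is a hypothetical stand-in for the parsed config.yaml.
config = {
    "PHONE_APPLICATIONS_FOREGROUND": {"APPLICATION_CATEGORIES": {"CATALOGUE_SOURCE": "FILE"}},
    "PHONE_APPLICATIONS_CRASHES": {"APPLICATION_CATEGORIES": {"CATALOGUE_SOURCE": "FILE"}},
    "PHONE_APPLICATIONS_NOTIFICATIONS": {"APPLICATION_CATEGORIES": {"CATALOGUE_SOURCE": "GOOGLE"}},
}

def catalogue_source(wildcard_type):
    # "foreground", "crashes" or "notifications" -> PHONE_APPLICATIONS_<TYPE>
    section = "PHONE_APPLICATIONS_" + str(wildcard_type).upper()
    return config[section]["APPLICATION_CATEGORIES"]["CATALOGUE_SOURCE"]

print(catalogue_source("crashes"))        # FILE
print(catalogue_source("notifications"))  # GOOGLE
```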
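In `process_location_types.R`, the new resampling branches first keep only the most accurate row within each roughly one-second bin (timestamps are in milliseconds and are binned with `timestamp %/% 1001`) before resampling. A pandas sketch of the same idea with made-up rows; column names follow the AWARE locations table:

```python
import pandas as pd

# Toy location rows; timestamps are in milliseconds, as in AWARE.
locations = pd.DataFrame({
    "timestamp": [1000, 1500, 2100, 2300],
    "double_latitude": [40.44, 40.45, 40.46, 40.47],
    "double_longitude": [-79.94, -79.95, -79.96, -79.97],
    "accuracy": [30, 12, 45, 20],
})

# Within each ~1-second bin keep the row with the best (lowest) accuracy,
# mirroring mutate(minute_bin = timestamp %/% 1001) + slice(which.min(accuracy)).
best_per_bin = locations.loc[
    locations.groupby(locations["timestamp"] // 1001)["accuracy"].idxmin()
]
print(best_per_bin)
```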
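The two resampling thresholds documented in `phone-locations.md` act together: the last known coordinate pair is carried forward to the next sensed timestamp only if the gap since the previous row is within `FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD` and the pair itself is not older than `FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION`. The helper below is a toy illustration of that decision, not RAPIDS code; the function name and arguments are made up.

```python
# Toy illustration of the two resampling thresholds (both in minutes).
CONSECUTIVE_THRESHOLD = 30          # max gap between consecutive sensed rows
TIME_SINCE_VALID_LOCATION = 720     # max carry-forward since the last real coordinate pair

def can_replicate(gap_since_previous_row, minutes_since_last_fix):
    return (gap_since_previous_row <= CONSECUTIVE_THRESHOLD
            and minutes_since_last_fix <= TIME_SINCE_VALID_LOCATION)

print(can_replicate(20, 60))    # True: small gap, recent coordinate pair
print(can_replicate(35, 60))    # False: the phone stopped sensing for more than 30 minutes
print(can_replicate(20, 800))   # False: the last coordinate pair is more than 12 hours old
```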