Migrate location providers to new file structure and segments

pull/103/head
JulioV 2020-08-28 13:53:00 -04:00
parent 3052693573
commit b0f1477d7e
57 changed files with 547 additions and 440 deletions

View File

@ -43,17 +43,6 @@ if config["CALLS"]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"])) files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"]))
files_to_compute.extend(expand("data/processed/{pid}/calls_{call_type}.csv", pid=config["PIDS"], call_type=config["CALLS"]["TYPES"])) files_to_compute.extend(expand("data/processed/{pid}/calls_{call_type}.csv", pid=config["PIDS"], call_type=config["CALLS"]["TYPES"]))
if config["BARNETT_LOCATION"]["COMPUTE"]:
if config["BARNETT_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED":
if config["BARNETT_LOCATION"]["DB_TABLE"] in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"]:
files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_resampled.csv", pid=config["PIDS"], sensor=config["BARNETT_LOCATION"]["DB_TABLE"]))
else:
raise ValueError("Error: Add your locations table (and as many sensor tables as you have) to [PHONE_VALID_SENSED_BINS][DB_TABLES] in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data) which is used to resample fused location data (RESAMPLED_FUSED)")
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["BARNETT_LOCATION"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["BARNETT_LOCATION"]["DB_TABLE"]))
files_to_compute.extend(expand("data/processed/{pid}/location_barnett_{day_segment}.csv", pid=config["PIDS"], day_segment = config["BARNETT_LOCATION"]["DAY_SEGMENTS"]))
if config["BLUETOOTH"]["COMPUTE"]: if config["BLUETOOTH"]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["BLUETOOTH"]["DB_TABLE"])) files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["BLUETOOTH"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["BLUETOOTH"]["DB_TABLE"])) files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["BLUETOOTH"]["DB_TABLE"]))
@ -142,16 +131,19 @@ if config["CONVERSATION"]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=pids, sensor=table)) files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=pids, sensor=table))
files_to_compute.extend(expand("data/processed/{pid}/conversation_{day_segment}.csv",pid=config["PIDS"], day_segment = config["CONVERSATION"]["DAY_SEGMENTS"])) files_to_compute.extend(expand("data/processed/{pid}/conversation_{day_segment}.csv",pid=config["PIDS"], day_segment = config["CONVERSATION"]["DAY_SEGMENTS"]))
if config["DORYAB_LOCATION"]["COMPUTE"]: for provider in config["LOCATIONS"]["PROVIDERS"].keys():
if config["DORYAB_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED": if config["LOCATIONS"]["PROVIDERS"][provider]["COMPUTE"]:
if config["DORYAB_LOCATION"]["DB_TABLE"] in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"]: if config["LOCATIONS"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED":
if config["LOCATIONS"]["DB_TABLE"] in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"]:
files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_resampled.csv", pid=config["PIDS"], sensor=config["DORYAB_LOCATION"]["DB_TABLE"]))
else: else:
raise ValueError("Error: Add your locations table (and as many sensor tables as you have) to [PHONE_VALID_SENSED_BINS][DB_TABLES] in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data) which is used to resample fused location data (RESAMPLED_FUSED)") raise ValueError("Error: Add your locations table (and as many sensor tables as you have) to [PHONE_VALID_SENSED_BINS][DB_TABLES] in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data) which is used to resample fused location data (RESAMPLED_FUSED)")
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["DORYAB_LOCATION"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["DORYAB_LOCATION"]["DB_TABLE"])) files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["LOCATIONS"]["DB_TABLE"]))
files_to_compute.extend(expand("data/processed/{pid}/location_doryab_{segment}.csv", pid=config["PIDS"], segment = config["DORYAB_LOCATION"]["DAY_SEGMENTS"])) files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["LOCATIONS"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_processed_{locations_to_use}.csv", pid=config["PIDS"], sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"]))
files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["LOCATIONS"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="LOCATIONS".lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="LOCATIONS".lower()))
# visualization for data exploration
if config["HEATMAP_FEATURES_CORRELATIONS"]["PLOT"]:

View File

@ -67,33 +67,34 @@ APPLICATION_GENRES:
  UPDATE_CATALOGUE_FILE: false # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE
  SCRAPE_MISSING_GENRES: false # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
RESAMPLE_FUSED_LOCATION:
  CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold
  TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row
  TIMEZONE: *timezone

BARNETT_LOCATION:
  COMPUTE: False
  DB_TABLE: locations
  DAY_SEGMENTS: [daily] # These features are only available on a daily basis
  FEATURES: ["hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","circdnrtn","wkenddayrtn"]
  LOCATIONS_TO_USE: ALL # ALL, ALL_EXCEPT_FUSED OR RESAMPLE_FUSED
  ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius
  TIMEZONE: *timezone
  MINUTES_DATA_USED: False # Use this for quality control purposes, how many minutes of data (location coordinates grouped by minute) were used to compute features

DORYAB_LOCATION:
  COMPUTE: False
  DB_TABLE: locations
  DAY_SEGMENTS: *day_segments
  FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy"]
  LOCATIONS_TO_USE: ALL # ALL, ALL_EXCEPT_FUSED OR RESAMPLE_FUSED
  DBSCAN_EPS: 10 # meters
  DBSCAN_MINSAMPLES: 5
  THRESHOLD_STATIC: 1 # km/h
  MAXIMUM_GAP_ALLOWED: 300
  MINUTES_DATA_USED: False
  SAMPLING_FREQUENCY: 0

LOCATIONS:
  DB_TABLE: locations
  LOCATIONS_TO_USE: FUSED_RESAMPLED # ALL, GPS OR FUSED_RESAMPLED
  FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold
  FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row
  TIMEZONE: *timezone
  PROVIDERS:
    DORYAB:
      COMPUTE: True
      FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy"]
      DBSCAN_EPS: 10 # meters
      DBSCAN_MINSAMPLES: 5
      THRESHOLD_STATIC: 1 # km/h
      MAXIMUM_GAP_ALLOWED: 300
      MINUTES_DATA_USED: True
      SAMPLING_FREQUENCY: 0
      SRC_FOLDER: "doryab"
      SRC_LANGUAGE: "python"
    BARNETT:
      COMPUTE: True
      FEATURES: ["hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","circdnrtn","wkenddayrtn"]
      ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius
      TIMEZONE: *timezone
      MINUTES_DATA_USED: True # Use this for quality control purposes, how many minutes of data (location coordinates grouped by minute) were used to compute features
      SRC_FOLDER: "barnett"
      SRC_LANGUAGE: "r"
BLUETOOTH:
  COMPUTE: False

View File

@ -579,17 +579,18 @@ are computed. See Ian Barnett, Jukka-Pekka Onnela, Inferring mobility measures f
See `Location (Barnetts) Config Code`_

**Available Day Segments (epochs):** only daily periods of EVERY_DAY_INTERVAL or FLEXIBLE_DAY_INTERVAL (periods that start at 00:00:00 and end at 23:59:59 on the same day)

**Available Platforms:** Android and iOS

**Snakemake rule chain:**

- Rule ``rules/preprocessing.snakefile/download_dataset`` (deduplication and sorting by timestamp)
- Rule ``rules/preprocessing.snakefile/readable_datetime`` (adds local date and time components and the local day segment)
- Rule ``rules/preprocessing.snakefile/phone_sensed_bins`` (gets the periods of time the phone was sensing data in order to resample over them)
- Rule ``rules/preprocessing.snakefile/process_location_types`` (filters GPS data or resamples fused location, deletes (0,0) coordinates)
- Rule ``rules/features.snakefile/locations_r_features`` (RAPIDS executes ``barnett_location_features`` from ``src/features/location/barnett/main.R``)
- Rule ``rules/features.snakefile/join_features_from_providers`` (joins the location features of all Python and R providers; sketched below)
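The join mentioned in the last rule is essentially an outer merge of every provider's feature file on ``local_segment``. A minimal Python sketch of that idea (the shipped implementation is the R script ``src/features/join_features_from_providers.R`` added in this commit; the participant id and paths below are hypothetical)::

    import pandas as pd
    from functools import reduce

    # Hypothetical per-provider outputs for participant p01.
    provider_files = [
        "data/interim/p01/locations_features/locations_r_BARNETT.csv",
        "data/interim/p01/locations_features/locations_python_DORYAB.csv",
    ]

    def join_provider_features(files):
        # Outer merge keeps a segment even if only one provider produced features for it.
        frames = [pd.read_csv(f) for f in files]
        return reduce(lambda left, right: left.merge(right, on="local_segment", how="outer"), frames)

    # joined = join_provider_features(provider_files)
    # joined.to_csv("data/processed/features/p01/locations.csv", index=False)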
.. _location-parameters:
@ -598,7 +599,7 @@ See `Location (Barnetts) Config Code`_
=================   ===================
Name                Description
=================   ===================
location_to_use     *Read the Observations section below*. This specifies what type of location data will be used in the analysis. Possible options are ``ALL``, ``GPS``, or ``FUSED_RESAMPLED``
accuracy_limit      This is in meters. The sensor drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within the radius specified.
timezone            The timezone used to calculate location.
minutes_data_used   This is NOT a feature. This is just a quality control check, and if set to TRUE, a new column is added to the output file with the number of minutes containing location data that were used to compute all features. The more data minutes exist for a period, the more reliable its features should be. For fused location, a single minute can contain more than one coordinate pair if the participant is moving fast enough.
@ -634,15 +635,15 @@ wkenddayrtn Same as circdnrtn but computed separately for w
*Types of location data to use*

Aware Android and iOS clients can collect location coordinates through the phone's GPS, the network cellular towers around the phone, or Google's fused location API. If you want to use only the GPS provider, set ``location_to_use`` to ``GPS``; if you want to use all providers (not recommended due to the difference in accuracy), set ``location_to_use`` to ``ALL``; if your Aware client was configured to use fused location only, or you want to focus only on this provider, set ``location_to_use`` to ``FUSED_RESAMPLED``. ``FUSED_RESAMPLED`` takes the original fused location coordinates and replicates each pair forward in time as long as the phone was sensing data, as indicated by ``phone_sensed_bins`` (see :ref:`Phone valid sensed days <phone-valid-sensed-days>`). This is done because Google's API only logs a new location coordinate pair when it is sufficiently different in time or space from the previous one.

There are two parameters associated with resampling fused location in the ``LOCATIONS`` section of the ``config.yaml`` file. ``FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD`` (in minutes, default 30) controls the maximum gap between any two coordinate pairs within which the last known pair is replicated (for example, if participant A's phone did not collect data between 10:30am and 10:50am and between 11:05am and 11:40am, the last known coordinate pair will be replicated during the first period but not the second; in other words, we assume we can no longer guarantee the participant stayed at the last known location if the phone did not sense data for more than 30 minutes). ``FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION`` (in minutes, default 720 or 12 hours) ensures the last known fused location is not carried over longer than this threshold even if the phone was sensing data continuously (for example, participant A went home at 9pm and their phone was sensing data without gaps until 11am the next morning; the last known location will only be replicated until 9am). If you have suggestions to modify or improve this imputation, let us know.
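A simplified Python sketch of this carry-forward resampling (assuming pandas and Aware-style column names; the actual implementation in this commit is the R script ``src/data/process_location_types.R``)::

    import pandas as pd

    def resample_fused(fused, sensed_minutes, consecutive_threshold=30, time_since_valid_location=720):
        # fused: fused provider rows with 'timestamp' (ms), 'double_latitude' and 'double_longitude'.
        # sensed_minutes: one row per minute the phone sensed any data, with only a 'timestamp' (ms).
        rows = pd.concat([fused, sensed_minutes], sort=False).sort_values("timestamp").reset_index(drop=True)
        # Start a new carry-forward group at every real fused fix, or whenever the gap to the
        # previous row exceeds consecutive_threshold minutes.
        gap_ms = rows["timestamp"].diff().fillna(0)
        rows["group"] = (rows["double_longitude"].notna() | (gap_ms > consecutive_threshold * 60 * 1000)).cumsum()
        # Keep rows logged within time_since_valid_location minutes of the group's valid fix...
        elapsed = rows.groupby("group")["timestamp"].transform(lambda t: t - t.iloc[0])
        rows = rows[elapsed < time_since_valid_location * 60 * 1000].copy()
        # ...and replicate the last known coordinates forward into the sensed minutes.
        rows[["double_latitude", "double_longitude"]] = rows.groupby("group")[["double_latitude", "double_longitude"]].ffill()
        return rows.dropna(subset=["double_latitude", "double_longitude"])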
*Barnett et al.'s features*

These features are based on a Pause-Flight model. A pause is defined as a mobility trace (location pings) that stays within a certain distance for at least a certain duration (by default 60 meters and 300 seconds). A flight is any mobility trace between two pauses. Data is resampled and imputed before the features are computed. See this paper for more information: https://doi.org/10.1093/biostatistics/kxy059.

In RAPIDS we only expose two parameters for these features (timezone and accuracy). If you wish to change others you can do so in ``src/features/location/barnett/library/MobilityFeatures.R``.
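To make the model concrete, here is a toy Python sketch that labels pings as pause or flight under those default thresholds (an illustration only; Barnett's actual algorithm in ``src/features/location/barnett/library`` is considerably more involved)::

    from math import radians, sin, cos, asin, sqrt

    def haversine_m(lat1, lon1, lat2, lon2):
        # Great-circle distance in meters.
        lat1, lon1, lat2, lon2 = map(radians, (lat1, lon1, lat2, lon2))
        a = sin((lat2 - lat1) / 2) ** 2 + cos(lat1) * cos(lat2) * sin((lon2 - lon1) / 2) ** 2
        return 2 * 6371000 * asin(sqrt(a))

    def label_pauses(pings, max_distance=60, min_duration=300):
        # pings: list of (timestamp_in_seconds, latitude, longitude) sorted by time.
        # A pause is a stretch of pings that stays within max_distance meters of its first
        # ping for at least min_duration seconds; everything else is part of a flight.
        labels = ["flight"] * len(pings)
        i = 0
        while i < len(pings):
            j = i
            while j + 1 < len(pings) and haversine_m(pings[i][1], pings[i][2], pings[j + 1][1], pings[j + 1][2]) <= max_distance:
                j += 1
            if pings[j][0] - pings[i][0] >= min_duration:
                for k in range(i, j + 1):
                    labels[k] = "pause"
            i = j + 1
        return labels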
*Significant Locations*
@ -660,17 +661,18 @@ Doryab's location features are based on this paper: Doryab, A., Chikarsel, P., L
See `Location (Doryab's) Config Code`_

**Available Day Segments (epochs):** any of EVERY_DAY_FREQUENCY, EVERY_DAY_INTERVAL, and FLEXIBLE_DAY_INTERVAL

**Available Platforms:** Android and iOS

**Snakemake rule chain:**

- Rule ``rules/preprocessing.snakefile/download_dataset`` (deduplication and sorting by timestamp)
- Rule ``rules/preprocessing.snakefile/readable_datetime`` (adds local date and time components and the local day segment)
- Rule ``rules/preprocessing.snakefile/phone_sensed_bins`` (gets the periods of time the phone was sensing data in order to resample over them)
- Rule ``rules/preprocessing.snakefile/process_location_types`` (filters GPS data or resamples fused location, deletes (0,0) coordinates)
- Rule ``rules/features.snakefile/locations_python_features`` (RAPIDS executes ``doryab_location_features`` from ``src/features/location/doryab/main.py``)
- Rule ``rules/features.snakefile/join_features_from_providers`` (joins the location features of all Python and R providers)

.. _location-doryab-parameters:
@ -680,7 +682,7 @@ See `Location (Doryab's) Config Code`_
Name                  Description
===================   ===================
day_segment           The particular ``day_segment`` that will be analyzed. The available options are ``daily``, ``morning``, ``afternoon``, ``evening``, ``night``
location_to_use       *Read the Observations section below*. This specifies what type of location data will be used in the analysis. Possible options are ``ALL``, ``GPS``, or ``FUSED_RESAMPLED``.
features              Features to be computed, see table below.
threshold_static      The threshold value in km/hr that labels a row as Static or Moving.
dbscan_minsamples     The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself.
@ -723,9 +725,9 @@ normalizedlocationentropy nats Shannon Entropy computed
*Types of location data to use*

Aware Android and iOS clients can collect location coordinates through the phone's GPS, the network cellular towers around the phone, or Google's fused location API. If you want to use only the GPS provider, set ``location_to_use`` to ``GPS``; if you want to use all providers (not recommended due to the difference in accuracy), set ``location_to_use`` to ``ALL``; if your Aware client was configured to use fused location only, or you want to focus only on this provider, set ``location_to_use`` to ``FUSED_RESAMPLED``. ``FUSED_RESAMPLED`` takes the original fused location coordinates and replicates each pair forward in time as long as the phone was sensing data, as indicated by ``phone_sensed_bins`` (see :ref:`Phone valid sensed days <phone-valid-sensed-days>`). This is done because Google's API only logs a new location coordinate pair when it is sufficiently different in time or space from the previous one.

There are two parameters associated with resampling fused location in the ``LOCATIONS`` section of the ``config.yaml`` file. ``FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD`` (in minutes, default 30) controls the maximum gap between any two coordinate pairs within which the last known pair is replicated (for example, if participant A's phone did not collect data between 10:30am and 10:50am and between 11:05am and 11:40am, the last known coordinate pair will be replicated during the first period but not the second; in other words, we assume we can no longer guarantee the participant stayed at the last known location if the phone did not sense data for more than 30 minutes). ``FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION`` (in minutes, default 720 or 12 hours) ensures the last known fused location is not carried over longer than this threshold even if the phone was sensing data continuously (for example, participant A went home at 9pm and their phone was sensing data without gaps until 11am the next morning; the last known location will only be replicated until 9am). If you have suggestions to modify or improve this imputation, let us know.
*Significant Locations Identified*
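The Doryab provider finds these significant places by clustering coordinates with DBSCAN, controlled by the ``dbscan_eps`` (meters) and ``dbscan_minsamples`` parameters documented above. A rough Python sketch of that clustering step (assuming scikit-learn; this is not the code in ``src/features/location/doryab/main.py``)::

    import numpy as np
    from sklearn.cluster import DBSCAN

    def significant_places(latitudes, longitudes, eps_meters=10, min_samples=5):
        coords = np.radians(np.column_stack([latitudes, longitudes]))
        # The haversine metric works in radians, so express eps_meters as an angle.
        eps = eps_meters / 6371000.0
        labels = DBSCAN(eps=eps, min_samples=min_samples, metric="haversine").fit_predict(coords)
        return labels  # -1 marks noise; other integers mark candidate significant places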

View File

@ -26,23 +26,13 @@ def optional_phone_sensed_bins_input(wildcards):
return expand("data/raw/{{pid}}/{table}_with_datetime.csv", table = tables_platform) return expand("data/raw/{{pid}}/{table}_with_datetime.csv", table = tables_platform)
def find_day_segments_input_file(wildcards):
    for key, values in config.items():
        if "DB_TABLE" in config[key] and config[key]["DB_TABLE"] == wildcards.sensor:
            if "DAY_SEGMENTS" in config[key]:
                return config[key]["DAY_SEGMENTS"]["FILE"]
            else:
                raise ValueError("{} should have a [DAY_SEGMENTS][FILE] parameter containing the path to its day segments file".format(wildcards.sensor))

def find_day_segments_input_type(wildcards):
    for key, values in config.items():
        if "DB_TABLE" in config[key] and config[key]["DB_TABLE"] == wildcards.sensor:
            if "DAY_SEGMENTS" in config[key]:
                return config[key]["DAY_SEGMENTS"]["TYPE"]
            else:
                raise ValueError("{} should have a [DAY_SEGMENTS][TYPE] parameter containing INTERVAL, FREQUENCY, or EVENT".format(wildcards.sensor))
# Features.smk #########################################################################################################
def find_features_files(wildcards):
    feature_files = []
    for provider_key, provider in config[(wildcards.sensor_key).upper()]["PROVIDERS"].items():
        if provider["COMPUTE"]:
            feature_files.extend(expand("data/interim/{{pid}}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", sensor_key=(wildcards.sensor_key).lower(), language=provider["SRC_LANGUAGE"].lower(), provider_key=provider_key))
    return(feature_files)
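For illustration, with the ``LOCATIONS`` section shown earlier in this commit and the wildcard ``sensor_key`` equal to ``locations``, the helper above resolves to one interim feature file per enabled provider. A hypothetical snippet (not part of the commit) that mimics that resolution:

    providers = {
        "DORYAB": {"COMPUTE": True, "SRC_LANGUAGE": "python"},
        "BARNETT": {"COMPUTE": True, "SRC_LANGUAGE": "r"},
    }
    feature_files = [
        "data/interim/{{pid}}/locations_features/locations_{language}_{provider_key}.csv".format(
            language=provider["SRC_LANGUAGE"].lower(), provider_key=provider_key)
        for provider_key, provider in providers.items() if provider["COMPUTE"]
    ]
    # feature_files == ['data/interim/{pid}/locations_features/locations_python_DORYAB.csv',
    #                   'data/interim/{pid}/locations_features/locations_r_BARNETT.csv']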
def optional_ar_input(wildcards):
    platform = infer_participant_platform("data/external/"+wildcards.pid)
@ -62,18 +52,6 @@ def optional_conversation_input(wildcards):
elif platform == "ios": elif platform == "ios":
return ["data/raw/{pid}/" + config["CONVERSATION"]["DB_TABLE"]["IOS"] + "_with_datetime_unified.csv"] return ["data/raw/{pid}/" + config["CONVERSATION"]["DB_TABLE"]["IOS"] + "_with_datetime_unified.csv"]
def optional_location_barnett_input(wildcards):
    if config["BARNETT_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED":
        return expand("data/raw/{{pid}}/{sensor}_resampled.csv", sensor=config["BARNETT_LOCATION"]["DB_TABLE"])
    else:
        return expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["BARNETT_LOCATION"]["DB_TABLE"])

def optional_location_doryab_input(wildcards):
    if config["DORYAB_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED":
        return expand("data/raw/{{pid}}/{sensor}_resampled.csv", sensor=config["DORYAB_LOCATION"]["DB_TABLE"])
    else:
        return expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["DORYAB_LOCATION"]["DB_TABLE"])
def optional_steps_sleep_input(wildcards):
    if config["STEP"]["EXCLUDE_SLEEP"]["EXCLUDE"] == True and config["STEP"]["EXCLUDE_SLEEP"]["TYPE"] == "FITBIT_BASED":
        return "data/raw/{pid}/fitbit_sleep_summary_with_datetime.csv"

View File

@ -1,3 +1,11 @@
rule join_features_from_providers:
    input:
        location_features = find_features_files
    output:
        "data/processed/features/{pid}/{sensor_key}.csv"
    script:
        "../src/features/join_features_from_providers.R"
rule messages_features:
    input:
        expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["MESSAGES"]["DB_TABLE"]),
@ -54,37 +62,29 @@ rule ios_activity_recognition_deltas:
    script:
        "../src/features/activity_recognition_deltas.R"
rule location_barnett_features:
    input:
        locations = optional_location_barnett_input
    params:
        features = config["BARNETT_LOCATION"]["FEATURES"],
        locations_to_use = config["BARNETT_LOCATION"]["LOCATIONS_TO_USE"],
        accuracy_limit = config["BARNETT_LOCATION"]["ACCURACY_LIMIT"],
        timezone = config["BARNETT_LOCATION"]["TIMEZONE"],
        minutes_data_used = config["BARNETT_LOCATION"]["MINUTES_DATA_USED"],
        day_segment = "{day_segment}"
    output:
        "data/processed/{pid}/location_barnett_{day_segment}.csv"
    script:
        "../src/features/location_barnett_features.R"

rule location_doryab_features:
    input:
        locations = optional_location_doryab_input
    params:
        features = config["DORYAB_LOCATION"]["FEATURES"],
        day_segment = "{day_segment}",
        dbscan_eps = config["DORYAB_LOCATION"]["DBSCAN_EPS"],
        dbscan_minsamples = config["DORYAB_LOCATION"]["DBSCAN_MINSAMPLES"],
        threshold_static = config["DORYAB_LOCATION"]["THRESHOLD_STATIC"],
        maximum_gap_allowed = config["DORYAB_LOCATION"]["MAXIMUM_GAP_ALLOWED"],
        minutes_data_used = config["DORYAB_LOCATION"]["MINUTES_DATA_USED"],
        sampling_frequency = config["DORYAB_LOCATION"]["SAMPLING_FREQUENCY"]
    output:
        "data/processed/{pid}/location_doryab_{day_segment}.csv"
    script:
        "../src/features/location_doryab_features.py"

rule locations_python_features:
    input:
        location_data = expand("data/raw/{{pid}}/{sensor}_processed_{locations_to_use}.csv", sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"]),
        day_segments_labels = "data/interim/day_segments_labels.csv"
    params:
        provider = lambda wildcards: config["LOCATIONS"]["PROVIDERS"][wildcards.provider_key],
        provider_key = "{provider_key}"
    output:
        "data/interim/{pid}/locations_features/locations_python_{provider_key}.csv"
    script:
        "../src/features/location/locations_entry.py"

rule locations_r_features:
    input:
        location_data = expand("data/raw/{{pid}}/{sensor}_processed_{locations_to_use}.csv", sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"]),
        day_segments_labels = "data/interim/day_segments_labels.csv"
    params:
        provider = lambda wildcards: config["LOCATIONS"]["PROVIDERS"][wildcards.provider_key],
        provider_key = "{provider_key}"
    output:
        "data/interim/{pid}/locations_features/locations_r_{provider_key}.csv"
    script:
        "../src/features/location/locations_entry.R"
rule bluetooth_features:
    input:

View File

@ -40,17 +40,17 @@ rule download_dataset:
rule compute_day_segments:
    input:
        config["DAY_SEGMENTS"]["FILE"]
    params:
        day_segments_type = config["DAY_SEGMENTS"]["TYPE"]
    output:
        segments_file = "data/interim/day_segments.csv",
        segments_labels_file = "data/interim/day_segments_labels.csv",
    script:
        "../src/data/compute_day_segments.py"

PHONE_SENSORS = []
PHONE_SENSORS.extend([config["MESSAGES"]["DB_TABLE"], config["CALLS"]["DB_TABLE"], config["LOCATIONS"]["DB_TABLE"], config["BLUETOOTH"]["DB_TABLE"], config["BATTERY"]["DB_TABLE"], config["SCREEN"]["DB_TABLE"], config["LIGHT"]["DB_TABLE"], config["ACCELEROMETER"]["DB_TABLE"], config["APPLICATIONS_FOREGROUND"]["DB_TABLE"], config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["CONVERSATION"]["DB_TABLE"]["IOS"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]])
PHONE_SENSORS.extend(config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"])
if len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) > 0:
@ -62,11 +62,11 @@ if len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) > 0:
rule readable_datetime:
    input:
        sensor_input = "data/raw/{pid}/{sensor}_raw.csv",
        day_segments = "data/interim/day_segments.csv"
    params:
        timezones = None,
        fixed_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"],
        day_segments_type = config["DAY_SEGMENTS"]["TYPE"]
    wildcard_constraints:
        sensor = '.*(' + '|'.join([re.escape(x) for x in PHONE_SENSORS]) + ').*' # only process smartphone sensors, not fitbit
    output:
@ -108,19 +108,22 @@ rule unify_ios_android:
    script:
        "../src/data/unify_ios_android.R"
rule resample_fused_location:
    input:
        locations = "data/raw/{pid}/{sensor}_raw.csv",
        phone_sensed_bins = rules.phone_sensed_bins.output
    params:
        bin_size = config["PHONE_VALID_SENSED_BINS"]["BIN_SIZE"],
        timezone = config["RESAMPLE_FUSED_LOCATION"]["TIMEZONE"],
        consecutive_threshold = config["RESAMPLE_FUSED_LOCATION"]["CONSECUTIVE_THRESHOLD"],
        time_since_valid_location = config["RESAMPLE_FUSED_LOCATION"]["TIME_SINCE_VALID_LOCATION"]
    output:
        "data/raw/{pid}/{sensor}_resampled.csv"
    script:
        "../src/data/resample_fused_location.R"

rule process_location_types:
    input:
        locations = "data/raw/{pid}/{sensor}_with_datetime.csv",
        phone_sensed_bins = rules.phone_sensed_bins.output,
        day_segments = "data/interim/day_segments.csv"
    params:
        bin_size = config["PHONE_VALID_SENSED_BINS"]["BIN_SIZE"],
        timezone = config["LOCATIONS"]["TIMEZONE"],
        consecutive_threshold = config["LOCATIONS"]["FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD"],
        time_since_valid_location = config["LOCATIONS"]["FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION"],
        day_segments_type = config["DAY_SEGMENTS"]["TYPE"],
        locations_to_use = "{locations_to_use}"
    output:
        "data/raw/{pid}/{sensor}_processed_{locations_to_use}.csv"
    script:
        "../src/data/process_location_types.R"
rule application_genres:
    input:

View File

@ -0,0 +1,85 @@
library("tidyverse")
library("lubridate")
assign_to_day_segment <- function(data, day_segments, day_segments_type, fixed_timezone){
if(day_segments_type == "FREQUENCY_EVERY_DAY"){
data <- data %>% mutate(local_date_time_obj = lubridate::parse_date_time(local_time, orders = c("HMS", "HM")))
day_segments <- day_segments %>% mutate(start_time = lubridate::parse_date_time(start_time, orders = c("HMS", "HM")),
end_time = start_time + minutes(length))
# Create a new column for each day_segment
for(row_id in 1:nrow(day_segments)){
row = day_segments[row_id,]
data <- data %>% mutate(!!paste("local_day_segment", row_id, sep = "_") := ifelse(local_date_time_obj >= row$start_time & local_date_time_obj < row$end_time,
paste0("[",
row$label, "_",
local_date, "_",
paste(str_pad(hour(row$start_time),2, pad="0"), str_pad(minute(row$start_time),2, pad="0"), str_pad(second(row$start_time),2, pad="0"),sep =":"),
"]"), NA))
}
} else if (day_segments_type == "INTERVAL_EVERY_DAY"){
data_dates <- data %>% select(local_date) %>% distinct(local_date)
inferred_day_segments <- crossing(day_segments, data_dates) %>%
mutate(start_local_date_time_obj = lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM"), tz = fixed_timezone),
end_local_date_time_obj = start_local_date_time_obj + lubridate::period(length),
date_time_interval = lubridate::interval(start_local_date_time_obj, end_local_date_time_obj)) %>%
group_by(label, local_date) %>%
mutate(group_start_datetime = lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM"), tz = fixed_timezone),
group_end_datetime = group_start_datetime + lubridate::period(length),
group_start_datetime = min(group_start_datetime),
group_end_datetime = max(group_end_datetime)) %>%
ungroup()
data <- data %>% mutate(local_date_time_obj = lubridate::ymd_hms(local_date_time, tz = fixed_timezone))
# Create a new column for each day_segment
for(row_id in 1:nrow(inferred_day_segments)){
row = inferred_day_segments[row_id,]
data <- data %>% mutate(!!paste("local_day_segment", row_id, sep = "_") := ifelse(local_date_time_obj %within% row$date_time_interval,
paste0("[",
paste(sep= "#",
row$label,
lubridate::date(row$group_start_datetime),
paste(str_pad(hour(row$group_start_datetime),2, pad="0"), str_pad(minute(row$group_start_datetime),2, pad="0"), str_pad(second(row$group_start_datetime),2, pad="0"),sep =":"),
lubridate::date(row$group_end_datetime),
paste(str_pad(hour(row$group_end_datetime),2, pad="0"), str_pad(minute(row$group_end_datetime),2, pad="0"), str_pad(second(row$group_end_datetime),2, pad="0"),sep =":")
),
"]"), NA))
}
} else if ( day_segments_type == "INTERVAL_FLEXIBLE_DAY"){
data <- data %>% mutate(local_date_time_obj = lubridate::ymd_hms(local_date_time, tz = fixed_timezone))
day_segments <- day_segments %>% mutate(shift = ifelse(shift == "0", "0seconds", shift),
start_local_date_time_obj = lubridate::ymd_hms(start_date_time, tz = fixed_timezone) + (lubridate::period(shift) * ifelse(shift_direction >= 0, 1, -1)),
end_local_date_time_obj = start_local_date_time_obj + lubridate::period(length),
date_time_interval = lubridate::interval(start_local_date_time_obj, end_local_date_time_obj))
# Create a new column for each day_segment
for(row_id in 1:nrow(day_segments)){
row = day_segments[row_id,]
print(row$length)
data <- data %>% mutate(!!paste("local_day_segment", row_id, sep = "_") := ifelse(local_date_time_obj %within% row$date_time_interval,
paste0("[",
paste(sep= "#",
row$label,
lubridate::date(row$start_local_date_time_obj),
paste(str_pad(hour(row$start_local_date_time_obj),2, pad="0"), str_pad(minute(row$start_local_date_time_obj),2, pad="0"), str_pad(second(row$start_local_date_time_obj),2, pad="0"),sep =":"),
lubridate::date(row$end_local_date_time_obj),
paste(str_pad(hour(row$end_local_date_time_obj),2, pad="0"), str_pad(minute(row$end_local_date_time_obj),2, pad="0"), str_pad(second(row$end_local_date_time_obj),2, pad="0"),sep =":")
),
"]"), NA))
}
}
# Join all day_segments in a single column
data <- data %>%
unite("assigned_segments", starts_with("local_day_segment"), sep = "|", na.rm = TRUE) %>%
select(-local_date_time_obj)
return(data)
}
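For interval segments, the code above tags each row with labels of the form ``[label#start_date#start_time#end_date#end_time]``, joined with ``|`` in ``assigned_segments`` when a row falls into several segments. Downstream feature code, such as the Barnett provider added in this commit, splits on ``#`` to decide whether a segment spans a full single day. A small Python sketch of building and checking such a label:

    def build_interval_label(label, start_date, start_time, end_date, end_time):
        return "[" + "#".join([label, start_date, start_time, end_date, end_time]) + "]"

    def is_full_single_day(segment_label):
        # Mirrors the check in src/features/location/barnett/main.R (split on '#').
        label, start_date, start_time, end_date, end_time = segment_label.strip("[]").split("#")
        return start_date == end_date and start_time == "00:00:00" and end_time == "23:59:59"

    example = build_interval_label("daily", "2020-08-28", "00:00:00", "2020-08-28", "23:59:59")
    assert is_full_single_day(example)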

View File

@ -0,0 +1,76 @@
source("renv/activate.R")
library(dplyr)
library(readr)
library(tidyr)
source("src/data/assign_to_day_segment.R")
bin_size <- snakemake@params[["bin_size"]]
timezone <- snakemake@params[["timezone"]]
consecutive_threshold <- snakemake@params[["consecutive_threshold"]]
time_since_valid_location <- snakemake@params[["time_since_valid_location"]]
locations_to_use <- snakemake@params[["locations_to_use"]]
day_segments <- read.csv(snakemake@input[["day_segments"]])
day_segments_type <- snakemake@params[["day_segments_type"]]
phone_sensed_bins <- read_csv(snakemake@input[["phone_sensed_bins"]], col_types = cols(local_date = col_character()))
locations <- read_csv(snakemake@input[["locations"]], col_types = cols()) %>%
    filter(double_latitude != 0 & double_longitude != 0)
if(!locations_to_use %in% c("ALL", "FUSED_RESAMPLED", "GPS")){
print("Unkown location filter, provide one of the following three: ALL, GPS, or FUSED_RESAMPLED")
quit(save = "no", status = 1, runLast = FALSE)
}
if(locations_to_use == "ALL"){
processed_locations <- locations
} else if(locations_to_use == "GPS"){
processed_locations <- locations %>% filter(provider == "gps")
} else if(locations_to_use == "FUSED_RESAMPLED"){
locations <- locations %>% filter(provider == "fused")
if(nrow(locations) > 0){
sensed_minute_bins <- phone_sensed_bins %>%
pivot_longer(-local_date, names_to = c("hour", "bin"), names_sep = "_", values_to = "sensor_count") %>%
mutate(hour = as.integer(hour), bin = as.integer(bin)) %>%
complete(nesting(local_date, hour), bin = seq(0, 59,1)) %>%
fill(sensor_count) %>%
mutate(timestamp = as.numeric(as.POSIXct(paste0(local_date, " ", hour,":", bin,":00"), format = "%Y-%m-%d %H:%M:%S", tz = timezone)) * 1000 ) %>%
filter(sensor_count > 0) %>%
select(timestamp)
resampled_locations <- locations %>%
select(-assigned_segments) %>%
bind_rows(sensed_minute_bins) %>%
mutate(provider = replace_na(provider, "resampled")) %>%
arrange(timestamp) %>%
# We group and therefore, fill in, missing rows that appear after a valid fused location record and exist
# within consecutive_threshold minutes from each other
mutate(consecutive_time_diff = c(1, diff(timestamp)),
resample_group = cumsum(!is.na(double_longitude) | consecutive_time_diff > (1000 * 60 * consecutive_threshold))) %>%
group_by(resample_group) %>%
# drop rows that are logged after time_since_valid_location minutes from the last valid fused location
filter((timestamp - first(timestamp) < (1000 * 60 * time_since_valid_location))) %>%
fill(-timestamp, -resample_group) %>%
select(-consecutive_time_diff) %>%
drop_na(double_longitude, double_latitude, accuracy) %>%
# Add local date_time
mutate(utc_date_time = as.POSIXct(timestamp/1000, origin="1970-01-01", tz="UTC"),
local_date_time = format(utc_date_time, tz = timezone, usetz = F)) %>%
separate(local_date_time, c("local_date","local_time"), "\\s", remove = FALSE) %>%
separate(local_time, c("local_hour", "local_minute"), ":", remove = FALSE, extra = "drop") %>%
mutate(local_hour = as.numeric(local_hour),
local_minute = as.numeric(local_minute)) %>%
# Delete resampled rows that exist in the same minute as other original (fused) rows
group_by(local_date, local_hour, local_minute) %>%
mutate(n = n()) %>%
filter(n == 1 | (n > 1 & provider == "fused")) %>%
select(-n) %>%
ungroup()
processed_locations <- assign_to_day_segment(resampled_locations, day_segments, day_segments_type, timezone)
} else {
processed_locations <- locations
}
}
write.csv(processed_locations,snakemake@output[[1]], row.names = F)

View File

@ -1,8 +1,8 @@
source("renv/activate.R") source("renv/activate.R")
library("tidyverse") library("tidyverse")
library("readr") library("readr")
library("lubridate")
source("src/data/assign_to_day_segment.R")
input <- read.csv(snakemake@input[["sensor_input"]]) %>% arrange(timestamp)
day_segments <- read.csv(snakemake@input[["day_segments"]])
@ -11,89 +11,6 @@ sensor_output <- snakemake@output[[1]]
timezone_periods <- snakemake@params[["timezone_periods"]]
fixed_timezone <- snakemake@params[["fixed_timezone"]]
assign_to_day_segment <- function(data, day_segments, day_segments_type, fixed_timezone){
if(day_segments_type == "FREQUENCY_EVERY_DAY"){
data <- data %>% mutate(local_date_time_obj = lubridate::parse_date_time(local_time, orders = c("HMS", "HM")))
day_segments <- day_segments %>% mutate(start_time = lubridate::parse_date_time(start_time, orders = c("HMS", "HM")),
end_time = start_time + minutes(length))
# Create a new column for each day_segment
for(row_id in 1:nrow(day_segments)){
row = day_segments[row_id,]
data <- data %>% mutate(!!paste("local_day_segment", row_id, sep = "_") := ifelse(local_date_time_obj >= row$start_time & local_date_time_obj < row$end_time,
paste0("[",
row$label, "_",
local_date, "_",
paste(str_pad(hour(row$start_time),2, pad="0"), str_pad(minute(row$start_time),2, pad="0"), str_pad(second(row$start_time),2, pad="0"),sep =":"),
"]"), NA))
}
} else if (day_segments_type == "INTERVAL_EVERY_DAY"){
data_dates <- data %>% select(local_date) %>% distinct(local_date)
inferred_day_segments <- crossing(day_segments, data_dates) %>%
mutate(start_local_date_time_obj = lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM"), tz = fixed_timezone),
end_local_date_time_obj = start_local_date_time_obj + lubridate::period(length),
date_time_interval = lubridate::interval(start_local_date_time_obj, end_local_date_time_obj)) %>%
group_by(label, local_date) %>%
mutate(group_start_datetime = lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM"), tz = fixed_timezone),
group_end_datetime = group_start_datetime + lubridate::period(length),
group_start_datetime = min(group_start_datetime),
group_end_datetime = max(group_end_datetime)) %>%
ungroup()
data <- data %>% mutate(local_date_time_obj = lubridate::ymd_hms(local_date_time, tz = fixed_timezone))
# Create a new column for each day_segment
for(row_id in 1:nrow(inferred_day_segments)){
row = inferred_day_segments[row_id,]
data <- data %>% mutate(!!paste("local_day_segment", row_id, sep = "_") := ifelse(local_date_time_obj %within% row$date_time_interval,
paste0("[",
paste(sep= "#",
row$label,
lubridate::date(row$group_start_datetime),
paste(str_pad(hour(row$group_start_datetime),2, pad="0"), str_pad(minute(row$group_start_datetime),2, pad="0"), str_pad(second(row$group_start_datetime),2, pad="0"),sep =":"),
lubridate::date(row$group_end_datetime),
paste(str_pad(hour(row$group_end_datetime),2, pad="0"), str_pad(minute(row$group_end_datetime),2, pad="0"), str_pad(second(row$group_end_datetime),2, pad="0"),sep =":")
),
"]"), NA))
}
} else if ( day_segments_type == "INTERVAL_FLEXIBLE_DAY"){
data <- data %>% mutate(local_date_time_obj = lubridate::ymd_hms(local_date_time, tz = fixed_timezone))
day_segments <- day_segments %>% mutate(shift = ifelse(shift == "0", "0seconds", shift),
start_local_date_time_obj = lubridate::ymd_hms(start_date_time, tz = fixed_timezone) + (lubridate::period(shift) * ifelse(shift_direction >= 0, 1, -1)),
end_local_date_time_obj = start_local_date_time_obj + lubridate::period(length),
date_time_interval = lubridate::interval(start_local_date_time_obj, end_local_date_time_obj))
# Create a new column for each day_segment
for(row_id in 1:nrow(day_segments)){
row = day_segments[row_id,]
print(row$length)
data <- data %>% mutate(!!paste("local_day_segment", row_id, sep = "_") := ifelse(local_date_time_obj %within% row$date_time_interval,
paste0("[",
paste(sep= "#",
row$label,
lubridate::date(row$start_local_date_time_obj),
paste(str_pad(hour(row$start_local_date_time_obj),2, pad="0"), str_pad(minute(row$start_local_date_time_obj),2, pad="0"), str_pad(second(row$start_local_date_time_obj),2, pad="0"),sep =":"),
lubridate::date(row$end_local_date_time_obj),
paste(str_pad(hour(row$end_local_date_time_obj),2, pad="0"), str_pad(minute(row$end_local_date_time_obj),2, pad="0"), str_pad(second(row$end_local_date_time_obj),2, pad="0"),sep =":")
),
"]"), NA))
}
}
# Join all day_segments in a single column
data <- data %>%
unite("assigned_segments", starts_with("local_day_segment"), sep = "|", na.rm = TRUE) %>%
select(-local_date_time_obj)
return(data)
}
split_local_date_time <- function(data, day_segments){
  split_data <- data %>%
    separate(local_date_time, c("local_date","local_time"), "\\s", remove = FALSE) %>%

View File

@ -1,59 +0,0 @@
source("renv/activate.R")
library(dplyr)
library(readr)
library(tidyr)
bin_size <- snakemake@params[["bin_size"]]
timezone <- snakemake@params[["timezone"]]
consecutive_threshold <- snakemake@params[["consecutive_threshold"]]
time_since_valid_location <- snakemake@params[["time_since_valid_location"]]
locations <- read_csv(snakemake@input[["locations"]], col_types = cols()) %>% filter(provider == "fused")
phone_sensed_bins <- read_csv(snakemake@input[["phone_sensed_bins"]], col_types = cols(local_date = col_character()))
if(nrow(locations) > 0){
sensed_minute_bins <- phone_sensed_bins %>%
pivot_longer(-local_date, names_to = c("hour", "bin"), names_sep = "_", values_to = "sensor_count") %>%
mutate(hour = as.integer(hour), bin = as.integer(bin)) %>%
complete(nesting(local_date, hour), bin = seq(0, 59,1)) %>%
fill(sensor_count) %>%
mutate(timestamp = as.numeric(as.POSIXct(paste0(local_date, " ", hour,":", bin,":00"), format = "%Y-%m-%d %H:%M:%S", tz = timezone)) * 1000 ) %>%
filter(sensor_count > 0) %>%
select(timestamp)
resampled_locations <- locations %>%
bind_rows(sensed_minute_bins) %>%
mutate(provider = replace_na(provider, "resampled")) %>%
arrange(timestamp) %>%
# We group and therefore, fill in, missing rows that appear after a valid fused location record and exist
# within consecutive_threshold minutes from each other
mutate(consecutive_time_diff = c(1, diff(timestamp)),
resample_group = cumsum(!is.na(double_longitude) | consecutive_time_diff > (1000 * 60 * consecutive_threshold))) %>%
group_by(resample_group) %>%
# drop rows that are logged after time_since_valid_location minutes from the last valid fused location
filter((timestamp - first(timestamp) < (1000 * 60 * time_since_valid_location))) %>%
fill(-timestamp, -resample_group) %>%
select(-consecutive_time_diff) %>%
drop_na(double_longitude, double_latitude, accuracy) %>%
# Add local date_time
mutate(utc_date_time = as.POSIXct(timestamp/1000, origin="1970-01-01", tz="UTC"),
local_date_time = format(utc_date_time, tz = timezone, usetz = F)) %>%
separate(local_date_time, c("local_date","local_time"), "\\s", remove = FALSE) %>%
separate(local_time, c("local_hour", "local_minute"), ":", remove = FALSE, extra = "drop") %>%
mutate(local_hour = as.numeric(local_hour),
local_minute = as.numeric(local_minute),
local_day_segment = case_when(local_hour %in% 0:5 ~ "night",
local_hour %in% 6:11 ~ "morning",
local_hour %in% 12:17 ~ "afternoon",
local_hour %in% 18:23 ~ "evening")) %>%
# Delete resampled rows that exist in the same minute as other original (fused) rows
group_by(local_date, local_hour, local_minute) %>%
mutate(n = n()) %>%
filter(n == 1 | (n > 1 & provider == "fused")) %>%
select(-n)
write.csv(resampled_locations,snakemake@output[[1]], row.names = F)
} else {
write.csv(locations,snakemake@output[[1]], row.names = F)
}

View File

@ -0,0 +1,14 @@
source("renv/activate.R")
library("tidyr")
library("dplyr")
location_features_files <- snakemake@input[["location_features"]]
location_features <- setNames(data.frame(matrix(ncol = 1, nrow = 0)), c("local_segment"))
for(location_features_file in location_features_files){
location_features <- merge(location_features, read.csv(location_features_file), all = TRUE)
}
write.csv(location_features, snakemake@output[[1]], row.names = FALSE)

View File

@ -0,0 +1,96 @@
source("renv/activate.R")
library("dplyr")
library("stringr")
# Load Ian Barnett's code. Taken from https://scholar.harvard.edu/ibarnett/software/gpsmobility
file.sources = list.files(c("src/features/location/barnett/library"), pattern="*.R$", full.names=TRUE, ignore.case=TRUE)
sapply(file.sources,source,.GlobalEnv)
create_empty_file <- function(requested_features){
return(data.frame(local_segment= character(),
locations_barnett_hometime= numeric(),
locations_barnett_disttravelled= numeric(),
locations_barnett_rog= numeric(),
locations_barnett_maxdiam= numeric(),
locations_barnett_maxhomedist= numeric(),
locations_barnett_siglocsvisited= numeric(),
locations_barnett_avgflightlen= numeric(),
locations_barnett_stdflightlen= numeric(),
locations_barnett_avgflightdur= numeric(),
locations_barnett_stdflightdur= numeric(),
locations_barnett_probpause= numeric(),
locations_barnett_siglocentropy= numeric(),
locations_barnett_minsmissing= numeric(),
locations_barnett_circdnrtn= numeric(),
locations_barnett_wkenddayrtn= numeric(),
locations_barnett_minutes_data_used= numeric()
) %>% select(all_of(requested_features)))
}
barnett_location_features <- function(location_data, day_segment, params){
location_features <- NULL
location <- location_data
accuracy_limit <- params[["ACCURACY_LIMIT"]]
timezone <- params[["TIMEZONE"]]
minutes_data_used <- params[["MINUTES_DATA_USED"]]
# Compute what features were requested
available_features <- c("hometime","disttravelled","rog","maxdiam", "maxhomedist","siglocsvisited","avgflightlen", "stdflightlen",
"avgflightdur","stdflightdur", "probpause","siglocentropy","minsmissing", "circdnrtn","wkenddayrtn")
requested_features <- intersect(unlist(params["FEATURES"], use.names = F), available_features)
requested_features <- c("local_segment", paste("locations_barnett", requested_features, sep = "_"))
if(minutes_data_used)
requested_features <- c(requested_features, "locations_barnett_minutes_data_used")
# Excludes datasets with less than 24 hours of data
if(max(location$timestamp) - min(location$timestamp) < 86400000)
location <- head(location, 0)
if (nrow(location) > 1){
# Filter by segment and skipping any non-daily segment
location <- location %>% filter_data_by_segment(day_segment)
segment <- location %>% head(1) %>% pull(local_segment)
segment_data <- str_split(segment, "#")[[1]]
if(segment_data[[2]] != segment_data[[4]] || segment_data[[3]] != "00:00:00" || segment_data[[5]] != "23:59:59"){
warning(paste("Barnett's location features cannot be computed for day segmentes that are not daily (cover 00:00:00 to 23:59:59 of every day). Skipping for ", segment))
location_features <- create_empty_file(requested_features)
} else {
# Count how many minutes of data we use to get location features
# Some minutes have multiple fused rows
location_minutes_used <- location %>%
group_by(local_date, local_hour) %>%
summarise(n_minutes = n_distinct(local_minute)) %>%
group_by(local_date) %>%
summarise(locations_barnett_minutes_data_used = sum(n_minutes)) %>%
select(local_date, locations_barnett_minutes_data_used)
# Save day segment to attach it later
location_dates_segments <- location %>% select(local_date, local_segment) %>% distinct(local_date, .keep_all = TRUE)
# Select only the columns that the algorithm needs
location <- location %>% select(timestamp, latitude = double_latitude, longitude = double_longitude, altitude = double_altitude, accuracy)
outputMobility <- MobilityFeatures(location, ACCURACY_LIM = accuracy_limit, tz = timezone)
if(is.null(outputMobility)){
location_features <- create_empty_file(requested_features)
} else{
# Copy index (dates) as a column
features <- cbind(rownames(outputMobility$featavg), outputMobility$featavg)
features <- as.data.frame(features)
features[-1] <- lapply(lapply(features[-1], as.character), as.numeric)
colnames(features)=c("local_date",tolower(paste("locations_barnett", colnames(outputMobility$featavg), sep = "_")))
# Add the minute count column
features <- left_join(features, location_minutes_used, by = "local_date")
# Add the day segment column for consistency
features <- left_join(features, location_dates_segments, by = "local_date")
location_features <- features %>% select(all_of(requested_features))
}
}
} else {
location_features <- create_empty_file(requested_features)
}
if(ncol(location_features) != length(requested_features))
stop(paste0("The number of features in the output dataframe (=", ncol(location_features),") does not match the expected value (=", length(requested_features),"). Verify your barnett location features"))
return(location_features)
}
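
To make the daily-only restriction concrete, here is a small sketch (not part of the pipeline) that splits a local_segment string on "#" and applies the same full-day check as above; the sample segment strings are made up:

def is_daily_segment(local_segment: str) -> bool:
    # local_segment format: label#start_date#start_time#end_date#end_time
    label, start_date, start_time, end_date, end_time = local_segment.split("#")
    return start_date == end_date and start_time == "00:00:00" and end_time == "23:59:59"

# Example (made-up segments): only the first one qualifies for Barnett's features
print(is_daily_segment("daily#2020-08-28#00:00:00#2020-08-28#23:59:59"))    # True
print(is_daily_segment("morning#2020-08-28#06:00:00#2020-08-28#11:59:59"))  # False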

View File

@ -4,21 +4,32 @@
from astropy.timeseries import LombScargle
from sklearn.cluster import DBSCAN
from math import radians, cos, sin, asin, sqrt

def doryab_location_features(location_data, day_segment, params, filter_data_by_segment, *args, **kwargs):
    requested_features = params["FEATURES"]
    dbscan_eps = params["DBSCAN_EPS"]
    dbscan_minsamples = params["DBSCAN_MINSAMPLES"]
    threshold_static = params["THRESHOLD_STATIC"]
    maximum_gap_allowed = params["MAXIMUM_GAP_ALLOWED"]
    sampling_frequency = params["SAMPLING_FREQUENCY"]
    minutes_data_used = params["MINUTES_DATA_USED"]
    if(minutes_data_used):
        requested_features.append("minutesdataused")

    # name of the features this function can compute
    base_features_names = ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","minutesdataused"]
    # the subset of requested features this function can compute
    features_to_compute = list(set(requested_features) & set(base_features_names))

    if location_data.empty:
        location_features = pd.DataFrame(columns=["local_segment"] + ["locations_doryab_" + x for x in features_to_compute])
    else:
        location_data = filter_data_by_segment(location_data, day_segment)
        if location_data.empty:
            location_features = pd.DataFrame(columns=["local_segment"] + ["locations_doryab_" + x for x in features_to_compute])
        else:
            location_features = pd.DataFrame()

@ -26,108 +37,108 @@
            sampling_frequency = getSamplingFrequency(location_data)

            if "minutesdataused" in features_to_compute:
                for localDate in location_data["local_segment"].unique():
                    location_features.loc[localDate,"locations_doryab_minutesdataused"] = getMinutesData(location_data[location_data["local_segment"]==localDate])

            location_features.index.name = 'local_segment'

            location_data = location_data[(location_data['double_latitude']!=0.0) & (location_data['double_longitude']!=0.0)]

            if "locationvariance" in features_to_compute:
                location_features["locations_doryab_locationvariance"] = location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var()

            if "loglocationvariance" in features_to_compute:
                location_features["locations_doryab_loglocationvariance"] = (location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var()).apply(lambda x: np.log10(x) if x > 0 else None)

            preComputedDistanceandSpeed = pd.DataFrame()
            for localDate in location_data['local_segment'].unique():
                distance, speeddf = get_all_travel_distances_meters_speed(location_data[location_data['local_segment']==localDate],threshold_static,maximum_gap_allowed)
                preComputedDistanceandSpeed.loc[localDate,"distance"] = distance.sum()
                preComputedDistanceandSpeed.loc[localDate,"avgspeed"] = speeddf[speeddf['speedTag'] == 'Moving']['speed'].mean()
                preComputedDistanceandSpeed.loc[localDate,"varspeed"] = speeddf[speeddf['speedTag'] == 'Moving']['speed'].var()

            if "totaldistance" in features_to_compute:
                for localDate in location_data['local_segment'].unique():
                    location_features.loc[localDate,"locations_doryab_totaldistance"] = preComputedDistanceandSpeed.loc[localDate,"distance"]

            if "averagespeed" in features_to_compute:
                for localDate in location_data['local_segment'].unique():
                    location_features.loc[localDate,"locations_doryab_averagespeed"] = preComputedDistanceandSpeed.loc[localDate,"avgspeed"]

            if "varspeed" in features_to_compute:
                for localDate in location_data['local_segment'].unique():
                    location_features.loc[localDate,"locations_doryab_varspeed"] = preComputedDistanceandSpeed.loc[localDate,"varspeed"]

            if "circadianmovement" in features_to_compute:
                for localDate in location_data['local_segment'].unique():
                    location_features.loc[localDate,"locations_doryab_circadianmovement"] = circadian_movement(location_data[location_data['local_segment']==localDate])

            newLocationData = cluster_and_label(location_data, eps= distance_to_degrees(dbscan_eps), min_samples=dbscan_minsamples)

            if "numberofsignificantplaces" in features_to_compute:
                for localDate in newLocationData['local_segment'].unique():
                    location_features.loc[localDate,"locations_doryab_numberofsignificantplaces"] = number_of_significant_places(newLocationData[newLocationData['local_segment']==localDate])

            if "numberlocationtransitions" in features_to_compute:
                for localDate in newLocationData['local_segment'].unique():
                    location_features.loc[localDate,"locations_doryab_numberlocationtransitions"] = number_location_transitions(newLocationData[newLocationData['local_segment']==localDate])

            if "radiusgyration" in features_to_compute:
                for localDate in newLocationData['local_segment'].unique():
                    location_features.loc[localDate,"locations_doryab_radiusgyration"] = radius_of_gyration(newLocationData[newLocationData['local_segment']==localDate],sampling_frequency)

            if "timeattop1location" in features_to_compute:
                for localDate in newLocationData['local_segment'].unique():
                    location_features.loc[localDate,"locations_doryab_timeattop1"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_segment']==localDate],1,sampling_frequency)

            if "timeattop2location" in features_to_compute:
                for localDate in newLocationData['local_segment'].unique():
                    location_features.loc[localDate,"locations_doryab_timeattop2"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_segment']==localDate],2,sampling_frequency)

            if "timeattop3location" in features_to_compute:
                for localDate in newLocationData['local_segment'].unique():
                    location_features.loc[localDate,"locations_doryab_timeattop3"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_segment']==localDate],3,sampling_frequency)

            if "movingtostaticratio" in features_to_compute:
                for localDate in newLocationData['local_segment'].unique():
                    location_features.loc[localDate,"locations_doryab_movingtostaticratio"] = (newLocationData[newLocationData['local_segment']==localDate].shape[0]*sampling_frequency) / (location_data[location_data['local_segment']==localDate].shape[0] * sampling_frequency)

            if "outlierstimepercent" in features_to_compute:
                for localDate in newLocationData['local_segment'].unique():
                    location_features.loc[localDate,"locations_doryab_outlierstimepercent"] = outliers_time_percent(newLocationData[newLocationData['local_segment']==localDate],sampling_frequency)

            preComputedmaxminCluster = pd.DataFrame()
            for localDate in newLocationData['local_segment'].unique():
                smax, smin, sstd, smean = len_stay_at_clusters_in_minutes(newLocationData[newLocationData['local_segment']==localDate],sampling_frequency)
                preComputedmaxminCluster.loc[localDate,"locations_doryab_maxlengthstayatclusters"] = smax
                preComputedmaxminCluster.loc[localDate,"locations_doryab_minlengthstayatclusters"] = smin
                preComputedmaxminCluster.loc[localDate,"locations_doryab_stdlengthstayatclusters"] = sstd
                preComputedmaxminCluster.loc[localDate,"locations_doryab_meanlengthstayatclusters"] = smean

            if "maxlengthstayatclusters" in features_to_compute:
                for localDate in newLocationData['local_segment'].unique():
                    location_features.loc[localDate,"locations_doryab_maxlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"locations_doryab_maxlengthstayatclusters"]

            if "minlengthstayatclusters" in features_to_compute:
                for localDate in newLocationData['local_segment'].unique():
                    location_features.loc[localDate,"locations_doryab_minlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"locations_doryab_minlengthstayatclusters"]

            if "stdlengthstayatclusters" in features_to_compute:
                for localDate in newLocationData['local_segment'].unique():
                    location_features.loc[localDate,"locations_doryab_stdlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"locations_doryab_stdlengthstayatclusters"]

            if "meanlengthstayatclusters" in features_to_compute:
                for localDate in newLocationData['local_segment'].unique():
                    location_features.loc[localDate,"locations_doryab_meanlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"locations_doryab_meanlengthstayatclusters"]

            if "locationentropy" in features_to_compute:
                for localDate in newLocationData['local_segment'].unique():
                    location_features.loc[localDate,"locations_doryab_locationentropy"] = location_entropy(newLocationData[newLocationData['local_segment']==localDate])

            if "normalizedlocationentropy" in features_to_compute:
                for localDate in newLocationData['local_segment'].unique():
                    location_features.loc[localDate,"locations_doryab_normalizedlocationentropy"] = location_entropy_normalized(newLocationData[newLocationData['local_segment']==localDate])

            location_features = location_features.reset_index()
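
As an illustration of the per-segment groupby pattern used by these features, a minimal pandas sketch of locationvariance (variance of latitude plus variance of longitude per segment) with toy coordinates and a made-up segment label:

import pandas as pd

location_data = pd.DataFrame({
    "local_segment":    ["daily#2020-08-28#00:00:00#2020-08-28#23:59:59"] * 3,
    "double_latitude":  [40.4406, 40.4412, 40.4379],
    "double_longitude": [-79.9959, -79.9961, -79.9902],
})

# Variance of latitude plus variance of longitude per segment, as in the feature above
locationvariance = (location_data.groupby("local_segment")["double_latitude"].var()
                    + location_data.groupby("local_segment")["double_longitude"].var())
print(locationvariance)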

View File

@ -0,0 +1,44 @@
source("renv/activate.R")
source("src/features/utils/utils.R")
library("dplyr")
library("stringr")
library("tidyr")
location_data <- read.csv(snakemake@input[["location_data"]], stringsAsFactors = FALSE)
day_segments_labels <- read.csv(snakemake@input[["day_segments_labels"]], stringsAsFactors = FALSE)
provider <- snakemake@params["provider"][["provider"]]
provider_key <- snakemake@params["provider_key"]
location_features <- data.frame(local_segment = character(), stringsAsFactors = FALSE)
if(!"FEATURES" %in% names(provider))
stop(paste0("Provider config[LOCATION][PROVIDERS][", provider_key,"] is missing a FEATURES attribute in config.yaml"))
if(provider[["COMPUTE"]] == TRUE){
code_path <- paste0("src/features/location/", provider[["SRC_FOLDER"]], "/main.R")
source(code_path)
features_function <- match.fun(paste0(provider[["SRC_FOLDER"]], "_location_features"))
day_segments <- day_segments_labels %>% pull(label)
for (day_segment in day_segments){
print(paste(rapids_log_tag,"Processing", provider_key, day_segment))
features <- features_function(location_data, day_segment, provider)
# Check all features names contain the provider key so they are unique
features_names <- colnames(features %>% select(-local_segment))
if(any(!grepl(paste0(".*(",str_to_lower(provider_key),").*"), features_names)))
stop(paste("The name of all location features of", provider_key," must contain its name in lower case but the following don't [", paste(features_names[!grepl(paste0(".*(",str_to_lower(provider_key),").*"), features_names)], collapse = ", "), "]"))
location_features <- merge(location_features, features, all = TRUE)
}
} else {
for(feature in provider[["FEATURES"]])
location_features[,feature] <- NA
}
location_features <- location_features %>% separate(col = local_segment,
into = c("local_segment_label", "local_start_date", "local_start_time", "local_end_date", "local_end_time"),
sep = "#",
remove = FALSE)
write.csv(location_features, snakemake@output[[1]], row.names = FALSE)

View File

@ -0,0 +1,39 @@
import pandas as pd
from importlib import import_module, util
from pathlib import Path
# import filter_data_by_segment from src/features/utils/utils.py
spec = util.spec_from_file_location("util", str(Path(snakemake.scriptdir).parent / "utils" / "utils.py"))
mod = util.module_from_spec(spec)
spec.loader.exec_module(mod)
filter_data_by_segment = getattr(mod, "filter_data_by_segment")
rapids_log_tag = getattr(mod, "rapids_log_tag")
location_data = pd.read_csv(snakemake.input["location_data"][0])
day_segments_labels = pd.read_csv(snakemake.input["day_segments_labels"], header=0)
mypath = snakemake.params["mypath"]
provider = snakemake.params["provider"]
provider_key = snakemake.params["provider_key"]
location_features = pd.DataFrame(columns=["local_segment"])
if "FEATURES" not in provider:
raise ValueError("Provider config[LOCATION][PROVIDERS][{}] is missing a FEATURES attribute in config.yaml".format(provider_key))
if provider["COMPUTE"] == True:
code_path = provider["SRC_FOLDER"] + ".main"
feature_module = import_module(code_path)
feature_function = getattr(feature_module, provider["SRC_FOLDER"] + "_location_features")
for day_segment in day_segments_labels["label"]:
print("{} Processing {} {}".format(rapids_log_tag, provider_key, day_segment))
features = feature_function(location_data, day_segment, provider, filter_data_by_segment=filter_data_by_segment)
location_features = location_features.merge(features, how="outer")
else:
for feature in provider["FEATURES"]:
location_features[feature] = None
segment_colums = pd.DataFrame()
segment_colums[["local_segment_label", "local_start_date", "local_start_time", "local_end_date", "local_end_time"]] = location_features["local_segment"].str.split(pat="#", expand=True)
for i in range(segment_colums.shape[1]):
location_features.insert(1 + i, segment_colums.columns[i], segment_colums[segment_colums.columns[i]])
location_features.to_csv(snakemake.output[0], index=False)
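
The str.split call above fans local_segment out into five columns that are inserted right after it. A minimal sketch of the same operation with a made-up feature row (the column renaming step is a slight restructuring for clarity):

import pandas as pd

# Made-up feature row keyed by a segment string of the form label#start_date#start_time#end_date#end_time
location_features = pd.DataFrame({"local_segment": ["daily#2020-08-28#00:00:00#2020-08-28#23:59:59"],
                                  "locations_doryab_totaldistance": [10543.2]})

segment_columns = location_features["local_segment"].str.split(pat="#", expand=True)
segment_columns.columns = ["local_segment_label", "local_start_date", "local_start_time",
                           "local_end_date", "local_end_time"]

# Insert the five new columns right after local_segment, as the script above does
for i in range(segment_columns.shape[1]):
    location_features.insert(1 + i, segment_columns.columns[i], segment_columns[segment_columns.columns[i]])
print(location_features)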

View File

@ -1,89 +0,0 @@
source("renv/activate.R")
# Load Ian Barnett's code. Taken from https://scholar.harvard.edu/ibarnett/software/gpsmobility
file.sources = list.files(c("src/features/location_barnett"), pattern="*.R$", full.names=TRUE, ignore.case=TRUE)
sapply(file.sources,source,.GlobalEnv)
library(dplyr)
write_empty_file <- function(file_path, requested_features){
write.csv(data.frame(local_date= character(),
location_barnett_hometime= numeric(),
location_barnett_disttravelled= numeric(),
location_barnett_rog= numeric(),
location_barnett_maxdiam= numeric(),
location_barnett_maxhomedist= numeric(),
location_barnett_siglocsvisited= numeric(),
location_barnett_avgflightlen= numeric(),
location_barnett_stdflightlen= numeric(),
location_barnett_avgflightdur= numeric(),
location_barnett_stdflightdur= numeric(),
location_barnett_probpause= numeric(),
location_barnett_siglocentropy= numeric(),
location_barnett_minsmissing= numeric(),
location_barnett_circdnrtn= numeric(),
location_barnett_wkenddayrtn= numeric(),
minutes_data_used= numeric()
) %>% select(requested_features), file_path, row.names = F)
}
location <- read.csv(snakemake@input[["locations"]], stringsAsFactors = F)
# The choice between RESAMPLE_FUSED and the original location data happens at the rule level in the function
# optional_location_input in features.snakefile
locations_to_use <- snakemake@params[["locations_to_use"]]
accuracy_limit <- snakemake@params[["accuracy_limit"]]
timezone <- snakemake@params[["timezone"]]
minutes_data_used <- snakemake@params[["minutes_data_used"]]
requested_features <- intersect(unlist(snakemake@params["features"], use.names = F),
c("hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","minsmissing","circdnrtn","wkenddayrtn"))
requested_features <- c("local_date", paste("location_barnett", requested_features, sep = "_"))
if(minutes_data_used)
requested_features <- c(requested_features, "minutes_data_used")
if(!locations_to_use %in% c("ALL_EXCEPT_FUSED", "RESAMPLE_FUSED", "ALL")){
print("Unkown filter, provide one of the following three: ALL, ALL_EXCEPT_FUSED, or RESAMPLE_FUSED")
quit(save = "no", status = 1, runLast = FALSE)
}
# excludes fused and resample
if(locations_to_use == "ALL_EXCEPT_FUSED")
location <- location %>% filter(provider == "gps")
# Remove 0,0 location coordinates
location <- location %>% filter(double_latitude != 0 & double_longitude != 0)
# Excludes datasets with less than 24 hours of data
if(max(location$timestamp) - min(location$timestamp) < 86400000)
location <- head(location, 0)
if (nrow(location) > 1){
# Count how many minutes of data we use to get location features
# Some minutes have multiple fused rows
location_minutes_used <- location %>%
group_by(local_date, local_hour) %>%
summarise(n_minutes = n_distinct(local_minute)) %>%
group_by(local_date) %>%
summarise(minutes_data_used = sum(n_minutes)) %>%
select(local_date, minutes_data_used)
location <- location %>%
select(timestamp, latitude = double_latitude, longitude = double_longitude, altitude = double_altitude, accuracy)
outputMobility <- MobilityFeatures(location, ACCURACY_LIM = accuracy_limit, tz = timezone)
if(is.null(outputMobility)){
write_empty_file(snakemake@output[[1]], requested_features)
} else{
# Copy index (dates) as a column
features <- cbind(rownames(outputMobility$featavg), outputMobility$featavg)
features <- as.data.frame(features)
features[-1] <- lapply(lapply(features[-1], as.character), as.numeric)
colnames(features)=c("local_date",tolower(paste("location_barnett", colnames(outputMobility$featavg), sep = "_")))
# Add the minute count column
features <- left_join(features, location_minutes_used, by = "local_date")
write.csv(features %>% select(requested_features), snakemake@output[[1]], row.names = F)
}
} else {
write_empty_file(snakemake@output[[1]], requested_features)
}

View File

@ -1,24 +0,0 @@
import pandas as pd
from location_doryab.location_base import base_location_features
location_data = pd.read_csv(snakemake.input[0], parse_dates=["local_date_time", "local_date"])
day_segment = snakemake.params["day_segment"]
requested_features = snakemake.params["features"]
location_features = pd.DataFrame(columns=["local_date"])
dbscan_eps = snakemake.params["dbscan_eps"]
dbscan_minsamples = snakemake.params["dbscan_minsamples"]
threshold_static = snakemake.params["threshold_static"]
maximum_gap_allowed = snakemake.params["maximum_gap_allowed"]
minutes_data_used = snakemake.params["minutes_data_used"]
sampling_frequency = snakemake.params["sampling_frequency"]
if(minutes_data_used):
requested_features.append("minutesdataused")
base_features = base_location_features(location_data, day_segment, requested_features, dbscan_eps, dbscan_minsamples,threshold_static,maximum_gap_allowed,sampling_frequency)
location_features = location_features.merge(base_features, on="local_date", how="outer")
assert len(requested_features) + 1 == location_features.shape[1], "The number of features in the output dataframe (=" + str(location_features.shape[1]) + ") does not match the expected value (=" + str(len(requested_features)) + " + 1). Verify your location feature extraction functions"
location_features.to_csv(snakemake.output[0], index=False)

View File

@ -0,0 +1,12 @@
library("stringr")
filter_data_by_segment <- function(data, day_segment){
# Filter the rows that belong to day_segment, and put the segment full name in a new column for grouping
date_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2}"
hour_regex = "[0-9]{2}:[0-9]{2}:[0-9]{2}"
data <- data %>%
filter(grepl(paste0("\\[", day_segment, "#"), assigned_segments)) %>%
mutate(local_segment = str_extract(assigned_segments, paste0("\\[", day_segment, "#", date_regex, "#", hour_regex, "#", date_regex, "#", hour_regex, "\\]")),
local_segment = str_sub(local_segment, 2, -2)) # get rid of first and last character([])
return(data)
}
rapids_log_tag <- "RAPIDS:"

View File

@ -0,0 +1,9 @@
def filter_data_by_segment(data, day_segment):
    # Filter the rows that belong to day_segment, and put the segment full name in a new column for grouping
    date_regex = "[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2}"
    hour_regex = "[0-9]{2}:[0-9]{2}:[0-9]{2}"
    segment_regex = "\[({}#{}#{}#{}#{})\]".format(day_segment, date_regex, hour_regex, date_regex, hour_regex)
    data["local_segment"] = data["assigned_segments"].str.extract(segment_regex, expand=True)
    return(data.dropna(subset = ["local_segment"]))
rapids_log_tag = "RAPIDS:"