Migrate location providers to new file structure and segments

parent 3052693573
commit b0f1477d7e

34 Snakefile

@@ -43,17 +43,6 @@ if config["CALLS"]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"]))
|
||||
files_to_compute.extend(expand("data/processed/{pid}/calls_{call_type}.csv", pid=config["PIDS"], call_type=config["CALLS"]["TYPES"]))
|
||||
|
||||
if config["BARNETT_LOCATION"]["COMPUTE"]:
|
||||
if config["BARNETT_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED":
|
||||
if config["BARNETT_LOCATION"]["DB_TABLE"] in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"]:
|
||||
files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_resampled.csv", pid=config["PIDS"], sensor=config["BARNETT_LOCATION"]["DB_TABLE"]))
|
||||
else:
|
||||
raise ValueError("Error: Add your locations table (and as many sensor tables as you have) to [PHONE_VALID_SENSED_BINS][DB_TABLES] in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data) which is used to resample fused location data (RESAMPLED_FUSED)")
|
||||
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["BARNETT_LOCATION"]["DB_TABLE"]))
|
||||
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["BARNETT_LOCATION"]["DB_TABLE"]))
|
||||
files_to_compute.extend(expand("data/processed/{pid}/location_barnett_{day_segment}.csv", pid=config["PIDS"], day_segment = config["BARNETT_LOCATION"]["DAY_SEGMENTS"]))
|
||||
|
||||
if config["BLUETOOTH"]["COMPUTE"]:
|
||||
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["BLUETOOTH"]["DB_TABLE"]))
|
||||
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["BLUETOOTH"]["DB_TABLE"]))
|
||||
@@ -142,16 +131,19 @@ if config["CONVERSATION"]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=pids, sensor=table))
|
||||
files_to_compute.extend(expand("data/processed/{pid}/conversation_{day_segment}.csv",pid=config["PIDS"], day_segment = config["CONVERSATION"]["DAY_SEGMENTS"]))
|
||||
|
||||
if config["DORYAB_LOCATION"]["COMPUTE"]:
|
||||
if config["DORYAB_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED":
|
||||
if config["DORYAB_LOCATION"]["DB_TABLE"] in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"]:
|
||||
files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_resampled.csv", pid=config["PIDS"], sensor=config["DORYAB_LOCATION"]["DB_TABLE"]))
|
||||
else:
|
||||
raise ValueError("Error: Add your locations table (and as many sensor tables as you have) to [PHONE_VALID_SENSED_BINS][DB_TABLES] in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data) which is used to resample fused location data (RESAMPLED_FUSED)")
|
||||
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["DORYAB_LOCATION"]["DB_TABLE"]))
|
||||
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["DORYAB_LOCATION"]["DB_TABLE"]))
|
||||
files_to_compute.extend(expand("data/processed/{pid}/location_doryab_{segment}.csv", pid=config["PIDS"], segment = config["DORYAB_LOCATION"]["DAY_SEGMENTS"]))
|
||||
for provider in config["LOCATIONS"]["PROVIDERS"].keys():
|
||||
if config["LOCATIONS"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||
if config["LOCATIONS"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED":
|
||||
if config["LOCATIONS"]["DB_TABLE"] in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"]:
|
||||
files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"]))
|
||||
else:
|
||||
raise ValueError("Error: Add your locations table (and as many sensor tables as you have) to [PHONE_VALID_SENSED_BINS][DB_TABLES] in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data) which is used to resample fused location data (RESAMPLED_FUSED)")
|
||||
|
||||
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["LOCATIONS"]["DB_TABLE"]))
|
||||
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["LOCATIONS"]["DB_TABLE"]))
|
||||
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_processed_{locations_to_use}.csv", pid=config["PIDS"], sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"]))
|
||||
files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["LOCATIONS"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="LOCATIONS".lower()))
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="LOCATIONS".lower()))
|
||||
|
||||
# visualization for data exploration
|
||||
if config["HEATMAP_FEATURES_CORRELATIONS"]["PLOT"]:
|

49 config.yaml

@@ -67,33 +67,34 @@ APPLICATION_GENRES:
UPDATE_CATALOGUE_FILE: false # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE
|
||||
SCRAPE_MISSING_GENRES: false # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
|
||||
|
||||
RESAMPLE_FUSED_LOCATION:
|
||||
CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold
|
||||
TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row
|
||||
TIMEZONE: *timezone
|
||||
|
||||
BARNETT_LOCATION:
|
||||
COMPUTE: False
|
||||
LOCATIONS:
|
||||
DB_TABLE: locations
|
||||
DAY_SEGMENTS: [daily] # These features are only available on a daily basis
|
||||
FEATURES: ["hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","circdnrtn","wkenddayrtn"]
|
||||
LOCATIONS_TO_USE: ALL # ALL, ALL_EXCEPT_FUSED OR RESAMPLE_FUSED
|
||||
ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius
|
||||
LOCATIONS_TO_USE: FUSED_RESAMPLED # ALL, GPS OR FUSED_RESAMPLED
|
||||
FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold
|
||||
FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row
|
||||
TIMEZONE: *timezone
|
||||
MINUTES_DATA_USED: False # Use this for quality control purposes, how many minutes of data (location coordinates grouped by minute) were used to compute features
|
||||
|
||||
PROVIDERS:
|
||||
DORYAB:
|
||||
COMPUTE: True
|
||||
FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy"]
|
||||
DBSCAN_EPS: 10 # meters
|
||||
DBSCAN_MINSAMPLES: 5
|
||||
THRESHOLD_STATIC : 1 # km/h
|
||||
MAXIMUM_GAP_ALLOWED: 300
|
||||
MINUTES_DATA_USED: True
|
||||
SAMPLING_FREQUENCY: 0
|
||||
SRC_FOLDER: "doryab"
|
||||
SRC_LANGUAGE: "python"
|
||||
|
||||
DORYAB_LOCATION:
|
||||
COMPUTE: False
|
||||
DB_TABLE: locations
|
||||
DAY_SEGMENTS: *day_segments
|
||||
FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy"]
|
||||
LOCATIONS_TO_USE: ALL # ALL, ALL_EXCEPT_FUSED OR RESAMPLE_FUSED
|
||||
DBSCAN_EPS: 10 # meters
|
||||
DBSCAN_MINSAMPLES: 5
|
||||
THRESHOLD_STATIC : 1 # km/h
|
||||
MAXIMUM_GAP_ALLOWED: 300
|
||||
MINUTES_DATA_USED: False
|
||||
SAMPLING_FREQUENCY: 0
|
||||
BARNETT:
|
||||
COMPUTE: True
|
||||
FEATURES: ["hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","circdnrtn","wkenddayrtn"]
|
||||
ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius
|
||||
TIMEZONE: *timezone
|
||||
MINUTES_DATA_USED: True # Use this for quality control purposes, how many minutes of data (location coordinates grouped by minute) were used to compute features
|
||||
SRC_FOLDER: "barnett"
|
||||
SRC_LANGUAGE: "r"
|
||||
|
||||
BLUETOOTH:
|
||||
COMPUTE: False
|
||||
@@ -579,17 +579,18 @@ are computed. See Ian Barnett, Jukka-Pekka Onnela, Inferring mobility measures f
|
||||
See `Location (Barnett’s) Config Code`_
|
||||
|
||||
**Available Epochs (day_segment) :** daily
|
||||
**Available Day Segments (epochs):** only daily periods of EVERY_DAY_INTERVAL or FLEXIBLE_DAY_INTERVAL (periods that start at 00:00:00 and end at 23:59:59 on the same day)
|
||||
|
||||
**Available Platforms:** Android and iOS
|
||||
|
||||
**Snakemake rule chain:**
|
||||
|
||||
- Rule ``rules/preprocessing.snakefile/download_dataset``
|
||||
- Rule ``rules/preprocessing.snakefile/readable_datetime``
|
||||
- Rule ``rules/preprocessing.snakefile/phone_sensed_bins``
|
||||
- Rule ``rules/preprocessing.snakefile/resample_fused_location`` (only relevant if setting ``location_to_use`` to ``RESAMPLE_FUSED``).
|
||||
- Rule ``rules/features.snakefile/location_barnett_features``
|
||||
- Rule ``rules/preprocessing.snakefile/download_dataset`` (deduplication and sorting by timestamp)
|
||||
- Rule ``rules/preprocessing.snakefile/readable_datetime`` (add local date and time components, add local day segment)
|
||||
- Rule ``rules/preprocessing.snakefile/phone_sensed_bins`` (get the periods of time the phone was sensing data to resample over them)
|
||||
- Rule ``rules/preprocessing.snakefile/process_location_types`` (filters GPS data or resamples fused location, deletes (0,0) coordinates)
- Rule ``rules/features.snakefile/locations_r_features`` (RAPIDS executes ``barnett_location_features`` from ``src/features/location/barnett/main.R``)
|
||||
- Rule ``rules/features.snakefile/join_features_from_providers`` (joins the location features of all python and r providers)
|
||||
|
||||
.. _location-parameters:
|
||||
|
||||
@@ -598,7 +599,7 @@ See `Location (Barnett’s) Config Code`_
================= ===================
|
||||
Name Description
|
||||
================= ===================
|
||||
location_to_use *Read the Observations section below*. This specifies what type of location data will be used in the analysis. Possible options are ``ALL``, ``ALL_EXCEPT_FUSED`` OR ``RESAMPLE_FUSED``
location_to_use *Read the Observations section below*. This specifies what type of location data will be used in the analysis. Possible options are ``ALL``, ``GPS`` OR ``FUSED_RESAMPLED``
|
||||
accuracy_limit This is in meters. The sensor drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within the specified radius.
|
||||
timezone The timezone used to calculate location.
|
||||
minutes_data_used This is NOT a feature. This is just a quality control check, and if set to TRUE, a new column is added to the output file with the number of minutes containing location data that were used to compute all features. The more data minutes exist for a period, the more reliable its features should be. For fused location, a single minute can contain more than one coordinate pair if the participant is moving fast enough.
|
@@ -634,15 +635,15 @@ wkenddayrtn Same as circdnrtn but computed separately for w
|
||||
*Types of location data to use*
|
||||
|
||||
Aware Android and iOS clients can collect location coordinates through the phone's GPS or Google's fused location API. If your Aware client was ONLY configured to use GPS set ``location_to_use`` to ``ALL``, if your client was configured to use BOTH GPS and fused location you can use ``ALL`` or set ``location_to_use`` to ``ALL_EXCEPT_FUSED`` to ignore fused coordinates, if your client was configured to use fused location only, set ``location_to_use`` to ``RESAMPLE_FUSED``. ``RESAMPLE_FUSED`` takes the original fused location coordinates and replicates each pair forward in time as long as the phone was sensing data as indicated by ``phone_sensed_bins`` (see :ref:`Phone valid sensed days <phone-valid-sensed-days>`), this is done because Google's API only logs a new location coordinate pair when it is sufficiently different from the previous one.
|
||||
Aware Android and iOS clients can collect location coordinates through the phone's GPS, the cellular network towers around the phone, or Google's fused location API. If you want to use only the GPS provider, set ``location_to_use`` to ``GPS``. If you want to use all providers (not recommended due to the difference in accuracy), set ``location_to_use`` to ``ALL``. If your Aware client was configured to use fused location only, or you want to focus only on this provider, set ``location_to_use`` to ``FUSED_RESAMPLED``. ``FUSED_RESAMPLED`` takes the original fused location coordinates and replicates each pair forward in time for as long as the phone was sensing data, as indicated by ``phone_sensed_bins`` (see :ref:`Phone valid sensed days <phone-valid-sensed-days>`); this is done because Google's API only logs a new location coordinate pair when it is sufficiently different in time or space from the previous one.
|
||||
|
||||
There are two parameters associated with resampling fused location in the ``RESAMPLE_FUSED_LOCATION`` section of the ``config.yaml`` file. ``CONSECUTIVE_THRESHOLD`` (in minutes, default 30) controls the maximum gap between any two coordinate pairs to replicate the last known pair (for example, participant A's phone did not collect data between 10.30am and 10:50am and between 11:05am and 11:40am, the last known coordinate pair will be replicated during the first period but not the second, in other words, we assume that we cannot longer guarantee the participant stayed at the last known location if the phone did not sense data for more than 30 minutes). ``TIME_SINCE_VALID_LOCATION`` (in minutes, default 720 or 12 hours) the last known fused location won't be carried over longer that this threshold even if the phone was sensing data continuously (for example, participant A went home at 9pm and their phone was sensing data without gaps until 11am the next morning, the last known location will only be replicated until 9am). If you have suggestions to modify or improve this imputation, let us know.
|
||||
There are two parameters associated with resampling fused location in the ``LOCATIONS`` section of the ``config.yaml`` file. ``FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD`` (in minutes, default 30) controls the maximum gap between any two coordinate pairs across which the last known pair is replicated (for example, if participant A's phone did not collect data between 10:30am and 10:50am and between 11:05am and 11:40am, the last known coordinate pair will be replicated during the first period but not the second; in other words, we assume we can no longer guarantee the participant stayed at the last known location if the phone did not sense data for more than 30 minutes). ``FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION`` (in minutes, default 720 or 12 hours) ensures the last known fused location is not carried over for longer than this threshold even if the phone was sensing data continuously (for example, if participant A went home at 9pm and their phone was sensing data without gaps until 11am the next morning, the last known location will only be replicated until 9am). If you have suggestions to modify or improve this imputation, let us know.
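To make these two thresholds concrete, here is a minimal pandas sketch of the carry-forward logic. This is not the RAPIDS implementation (that lives in ``src/data/process_location_types.R``); the function, column, and variable names below are assumptions used only for illustration.

.. code-block:: python

    import pandas as pd

    CONSECUTIVE_THRESHOLD = 30          # minutes (FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD)
    TIME_SINCE_VALID_LOCATION = 720     # minutes (FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION)

    def resample_fused(locations: pd.DataFrame, sensed_bins: pd.DataFrame) -> pd.DataFrame:
        # locations: one row per fused coordinate; sensed_bins: one row per sensed minute.
        # Both frames are assumed to have a "timestamp" column in milliseconds.
        merged = pd.concat([locations, sensed_bins], ignore_index=True, sort=False).sort_values("timestamp")
        gap_minutes = merged["timestamp"].diff().fillna(0) / 60000
        # Start a new group at every real coordinate or whenever the gap exceeds the threshold
        merged["group"] = (merged["double_longitude"].notna()
                           | (gap_minutes > CONSECUTIVE_THRESHOLD)).cumsum()
        # Carry the last known coordinate forward within each group
        coords = ["double_latitude", "double_longitude"]
        merged[coords] = merged.groupby("group")[coords].ffill()
        # Drop rows logged too long after the first valid location of their group
        first_ts = merged.groupby("group")["timestamp"].transform("first")
        merged = merged[merged["timestamp"] - first_ts < TIME_SINCE_VALID_LOCATION * 60000]
        return merged.dropna(subset=coords)

Rows contributed by the sensed-minute bins inherit the last known coordinate of their group, which mirrors how the resampled rows are tagged with ``provider == "resampled"`` in the R script.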
|
||||
|
||||
*Barnett's et al features*
|
||||
|
||||
These features are based on a Pause-Flight model. A pause is defined as a mobility trace (location pings) within a certain duration and distance (by default 300 seconds and 60 meters). A flight is any mobility trace between two pauses. Data is resampled and imputed before the features are computed. See this paper for more information: https://doi.org/10.1093/biostatistics/kxy059.
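As a rough illustration of this pause definition only (Barnett's actual algorithm is vendored under ``src/features/location/barnett/library`` and is considerably more involved), a run of consecutive pings could be tested against the default 300-second / 60-meter thresholds like this:

.. code-block:: python

    from math import radians, sin, cos, asin, sqrt

    def haversine_m(lat1, lon1, lat2, lon2):
        # Great-circle distance between two coordinate pairs, in meters
        lat1, lon1, lat2, lon2 = map(radians, (lat1, lon1, lat2, lon2))
        a = sin((lat2 - lat1) / 2) ** 2 + cos(lat1) * cos(lat2) * sin((lon2 - lon1) / 2) ** 2
        return 2 * 6371000 * asin(sqrt(a))

    def is_pause(pings, min_duration_s=300, max_radius_m=60):
        # pings: list of (timestamp_seconds, latitude, longitude) tuples ordered by time
        t0, lat0, lon0 = pings[0]
        stays_close = all(haversine_m(lat0, lon0, lat, lon) <= max_radius_m for _, lat, lon in pings)
        long_enough = pings[-1][0] - t0 >= min_duration_s
        return stays_close and long_enough

Any mobility trace between two such pauses would then count as a flight.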
|
||||
|
||||
In RAPIDS we only expose two parameters for these features (timezone and accuracy). If you wish to change others you can do so in ``src/features/location_barnett/MobilityFeatures.R``
|
||||
In RAPIDS we only expose two parameters for these features (timezone and accuracy). If you wish to change others you can do so in ``src/features/location/barnett/library/MobilityFeatures.R``
|
||||
|
||||
*Significant Locations*
|
||||
@@ -660,17 +661,18 @@ Doryab's location features are based on this paper: Doryab, A., Chikarsel, P., L
|
||||
See `Location (Doryab's) Config Code`_
|
||||
|
||||
**Available Epochs (day_segment) :** daily, morning, afternoon, evening, night
|
||||
**Available Day Segments (epochs):** any of EVERY_DAY_FREQUENCY, EVERY_DAY_INTERVAL and FLEXIBLE_DAY_INTERVAL
|
||||
|
||||
**Available Platforms:** Android and iOS
|
||||
|
||||
**Snakemake rule chain:**
|
||||
|
||||
- Rule ``rules/preprocessing.snakefile/download_dataset``
|
||||
- Rule ``rules/preprocessing.snakefile/readable_datetime``
|
||||
- Rule ``rules/preprocessing.snakefile/phone_sensed_bins``
|
||||
- Rule ``rules/preprocessing.snakefile/resample_fused_location`` (only relevant if setting ``location_to_use`` to ``RESAMPLE_FUSED``).
|
||||
- Rule ``rules/features.snakefile/location_doryab_features``
|
||||
- Rule ``rules/preprocessing.snakefile/download_dataset`` (deduplication and sorting by timestamp)
|
||||
- Rule ``rules/preprocessing.snakefile/readable_datetime`` (add local date and time components, add local day segment)
|
||||
- Rule ``rules/preprocessing.snakefile/phone_sensed_bins`` (get the periods of time the phone was sensing data to resample over them)
|
||||
- Rule ``rules/preprocessing.snakefile/process_location_types`` (filters GPS data or resamples fused location, deletes (0,0) coordinates)
- Rule ``rules/features.snakefile/locations_python_features`` (RAPIDS executes ``doryab_location_features`` from ``src/features/location/doryab/main.py``)
|
||||
- Rule ``rules/features.snakefile/join_features_from_providers`` (joins the location features of all python and r providers)
|
||||
|
||||
.. _location-doryab-parameters:
|
||||
|
||||
@@ -680,7 +682,7 @@ See `Location (Doryab's) Config Code`_
Name Description
|
||||
=================== ===================
|
||||
day_segment The particular ``day_segment`` that will be analyzed. The available options are ``daily``, ``morning``, ``afternoon``, ``evening``, ``night``
|
||||
location_to_use *Read the Observations section below*. This specifies what type of location data will be used in the analysis. Possible options are ``ALL``, ``ALL_EXCEPT_FUSED`` OR ``RESAMPLE_FUSED``.
location_to_use *Read the Observations section below*. This specifies what type of location data will be used in the analysis. Possible options are ``ALL``, ``GPS`` OR ``FUSED_RESAMPLED``.
|
||||
features Features to be computed, see table below.
|
||||
threshold_static It is the threshold value in km/hr which labels a row as Static or Moving.
|
||||
dbscan_minsamples The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself.
|
||||
@@ -723,9 +725,9 @@ normalizedlocationentropy nats Shannon Entropy computed
|
||||
*Types of location data to use*
|
||||
|
||||
Aware Android and iOS clients can collect location coordinates through the phone's GPS or Google's fused location API. If your Aware client was ONLY configured to use GPS set ``location_to_use`` to ``ALL``, if your client was configured to use BOTH GPS and fused location you can use ``ALL`` or set ``location_to_use`` to ``ALL_EXCEPT_FUSED`` to ignore fused coordinates, if your client was configured to use fused location only, set ``location_to_use`` to ``RESAMPLE_FUSED``. ``RESAMPLE_FUSED`` takes the original fused location coordinates and replicates each pair forward in time as long as the phone was sensing data as indicated by ``phone_sensed_bins`` (see :ref:`Phone valid sensed days <phone-valid-sensed-days>`), this is done because Google's API only logs a new location coordinate pair when it is sufficiently different from the previous one.
|
||||
Aware Android and iOS clients can collect location coordinates through the phone's GPS, the cellular network towers around the phone, or Google's fused location API. If you want to use only the GPS provider, set ``location_to_use`` to ``GPS``. If you want to use all providers (not recommended due to the difference in accuracy), set ``location_to_use`` to ``ALL``. If your Aware client was configured to use fused location only, or you want to focus only on this provider, set ``location_to_use`` to ``FUSED_RESAMPLED``. ``FUSED_RESAMPLED`` takes the original fused location coordinates and replicates each pair forward in time for as long as the phone was sensing data, as indicated by ``phone_sensed_bins`` (see :ref:`Phone valid sensed days <phone-valid-sensed-days>`); this is done because Google's API only logs a new location coordinate pair when it is sufficiently different in time or space from the previous one.
|
||||
|
||||
There are two parameters associated with resampling fused location in the ``RESAMPLE_FUSED_LOCATION`` section of the ``config.yaml`` file. ``CONSECUTIVE_THRESHOLD`` (in minutes, default 30) controls the maximum gap between any two coordinate pairs to replicate the last known pair (for example, participant A's phone did not collect data between 10.30am and 10:50am and between 11:05am and 11:40am, the last known coordinate pair will be replicated during the first period but not the second, in other words, we assume that we cannot longer guarantee the participant stayed at the last known location if the phone did not sense data for more than 30 minutes). ``TIME_SINCE_VALID_LOCATION`` (in minutes, default 720 or 12 hours) the last known fused location won't be carried over longer that this threshold even if the phone was sensing data continuously (for example, participant A went home at 9pm and their phone was sensing data without gaps until 11am the next morning, the last known location will only be replicated until 9am). If you have suggestions to modify or improve this imputation, let us know.
|
||||
There are two parameters associated with resampling fused location in the ``LOCATIONS`` section of the ``config.yaml`` file. ``FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD`` (in minutes, default 30) controls the maximum gap between any two coordinate pairs across which the last known pair is replicated (for example, if participant A's phone did not collect data between 10:30am and 10:50am and between 11:05am and 11:40am, the last known coordinate pair will be replicated during the first period but not the second; in other words, we assume we can no longer guarantee the participant stayed at the last known location if the phone did not sense data for more than 30 minutes). ``FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION`` (in minutes, default 720 or 12 hours) ensures the last known fused location is not carried over for longer than this threshold even if the phone was sensing data continuously (for example, if participant A went home at 9pm and their phone was sensing data without gaps until 11am the next morning, the last known location will only be replicated until 9am). If you have suggestions to modify or improve this imputation, let us know.
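Several of the features above (``numberofsignificantplaces``, ``timeattop1location``, ``locationentropy``, and so on) rely on clustering coordinates into significant places with DBSCAN, driven by ``DBSCAN_EPS`` and ``DBSCAN_MINSAMPLES``. The snippet below is only an illustrative sketch, not the code in ``src/features/location/doryab/main.py``; in particular, converting ``eps`` from meters to an angular distance for the haversine metric is an assumption made here for the example.

.. code-block:: python

    import numpy as np
    from sklearn.cluster import DBSCAN

    def label_significant_places(latitudes, longitudes, eps_meters=10, min_samples=5):
        # Cluster coordinate pairs into candidate significant places
        coords = np.radians(np.column_stack((latitudes, longitudes)))
        labels = DBSCAN(eps=eps_meters / 6371000.0,   # meters -> radians on the unit sphere
                        min_samples=min_samples,
                        metric="haversine",
                        algorithm="ball_tree").fit_predict(coords)
        return labels  # -1 marks noise; every other label identifies one place

Counting the distinct non-noise labels yields a quantity analogous to ``numberofsignificantplaces``.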
|
||||
|
||||
*Significant Locations Identified*
|
||||
|
||||
@@ -26,23 +26,13 @@ def optional_phone_sensed_bins_input(wildcards):
|
||||
return expand("data/raw/{{pid}}/{table}_with_datetime.csv", table = tables_platform)
|
||||
|
||||
def find_day_segments_input_file(wildcards):
|
||||
for key, values in config.items():
|
||||
if "DB_TABLE" in config[key] and config[key]["DB_TABLE"] == wildcards.sensor:
|
||||
if "DAY_SEGMENTS" in config[key]:
|
||||
return config[key]["DAY_SEGMENTS"]["FILE"]
|
||||
else:
|
||||
raise ValueError("{} should have a [DAY_SEGMENTS][FILE] parameter containing the path to its day segments file".format(wildcards.sensor))
|
||||
|
||||
def find_day_segments_input_type(wildcards):
|
||||
for key, values in config.items():
|
||||
if "DB_TABLE" in config[key] and config[key]["DB_TABLE"] == wildcards.sensor:
|
||||
if "DAY_SEGMENTS" in config[key]:
|
||||
return config[key]["DAY_SEGMENTS"]["TYPE"]
|
||||
else:
|
||||
raise ValueError("{} should have a [DAY_SEGMENTS][TYPE] parameter containing INTERVAL, FREQUENCY, or EVENT".format(wildcards.sensor))
|
||||
|
||||
# Features.smk #########################################################################################################
|
||||
def find_features_files(wildcards):
|
||||
feature_files = []
|
||||
for provider_key, provider in config[(wildcards.sensor_key).upper()]["PROVIDERS"].items():
|
||||
if provider["COMPUTE"]:
|
||||
feature_files.extend(expand("data/interim/{{pid}}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", sensor_key=(wildcards.sensor_key).lower(), language=provider["SRC_LANGUAGE"].lower(), provider_key=provider_key))
|
||||
return(feature_files)
|
||||
|
||||
def optional_ar_input(wildcards):
|
||||
platform = infer_participant_platform("data/external/"+wildcards.pid)
|
||||
@@ -62,18 +52,6 @@ def optional_conversation_input(wildcards):
elif platform == "ios":
|
||||
return ["data/raw/{pid}/" + config["CONVERSATION"]["DB_TABLE"]["IOS"] + "_with_datetime_unified.csv"]
|
||||
|
||||
def optional_location_barnett_input(wildcards):
|
||||
if config["BARNETT_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED":
|
||||
return expand("data/raw/{{pid}}/{sensor}_resampled.csv", sensor=config["BARNETT_LOCATION"]["DB_TABLE"])
|
||||
else:
|
||||
return expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["BARNETT_LOCATION"]["DB_TABLE"])
|
||||
|
||||
def optional_location_doryab_input(wildcards):
|
||||
if config["DORYAB_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED":
|
||||
return expand("data/raw/{{pid}}/{sensor}_resampled.csv", sensor=config["DORYAB_LOCATION"]["DB_TABLE"])
|
||||
else:
|
||||
return expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["DORYAB_LOCATION"]["DB_TABLE"])
|
||||
|
||||
def optional_steps_sleep_input(wildcards):
|
||||
if config["STEP"]["EXCLUDE_SLEEP"]["EXCLUDE"] == True and config["STEP"]["EXCLUDE_SLEEP"]["TYPE"] == "FITBIT_BASED":
|
||||
return "data/raw/{pid}/fitbit_sleep_summary_with_datetime.csv"
|
||||
@@ -1,3 +1,11 @@
rule join_features_from_providers:
|
||||
input:
|
||||
location_features = find_features_files
|
||||
output:
|
||||
"data/processed/features/{pid}/{sensor_key}.csv"
|
||||
script:
|
||||
"../src/features/join_features_from_providers.R"
|
||||
|
||||
rule messages_features:
|
||||
input:
|
||||
expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["MESSAGES"]["DB_TABLE"]),
|
||||
@@ -54,37 +62,29 @@ rule ios_activity_recognition_deltas:
script:
|
||||
"../src/features/activity_recognition_deltas.R"
|
||||
|
||||
rule location_barnett_features:
|
||||
rule locations_python_features:
|
||||
input:
|
||||
locations = optional_location_barnett_input
|
||||
location_data = expand("data/raw/{{pid}}/{sensor}_processed_{locations_to_use}.csv", sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"]),
|
||||
day_segments_labels = "data/interim/day_segments_labels.csv"
|
||||
params:
|
||||
features = config["BARNETT_LOCATION"]["FEATURES"],
|
||||
locations_to_use = config["BARNETT_LOCATION"]["LOCATIONS_TO_USE"],
|
||||
accuracy_limit = config["BARNETT_LOCATION"]["ACCURACY_LIMIT"],
|
||||
timezone = config["BARNETT_LOCATION"]["TIMEZONE"],
|
||||
minutes_data_used = config["BARNETT_LOCATION"]["MINUTES_DATA_USED"],
|
||||
day_segment = "{day_segment}"
|
||||
provider = lambda wildcards: config["LOCATIONS"]["PROVIDERS"][wildcards.provider_key],
|
||||
provider_key = "{provider_key}",
|
||||
output:
|
||||
"data/processed/{pid}/location_barnett_{day_segment}.csv"
|
||||
"data/interim/{pid}/locations_features/locations_python_{provider_key}.csv"
|
||||
script:
|
||||
"../src/features/location_barnett_features.R"
|
||||
"../src/features/location/locations_entry.py"
|
||||
|
||||
rule location_doryab_features:
|
||||
rule locations_r_features:
|
||||
input:
|
||||
locations = optional_location_doryab_input
|
||||
location_data = expand("data/raw/{{pid}}/{sensor}_processed_{locations_to_use}.csv", sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"]),
|
||||
day_segments_labels = "data/interim/day_segments_labels.csv"
|
||||
params:
|
||||
features = config["DORYAB_LOCATION"]["FEATURES"],
|
||||
day_segment = "{day_segment}",
|
||||
dbscan_eps = config["DORYAB_LOCATION"]["DBSCAN_EPS"],
|
||||
dbscan_minsamples = config["DORYAB_LOCATION"]["DBSCAN_MINSAMPLES"],
|
||||
threshold_static = config["DORYAB_LOCATION"]["THRESHOLD_STATIC"],
|
||||
maximum_gap_allowed = config["DORYAB_LOCATION"]["MAXIMUM_GAP_ALLOWED"],
|
||||
minutes_data_used = config["DORYAB_LOCATION"]["MINUTES_DATA_USED"],
|
||||
sampling_frequency = config["DORYAB_LOCATION"]["SAMPLING_FREQUENCY"]
|
||||
provider = lambda wildcards: config["LOCATIONS"]["PROVIDERS"][wildcards.provider_key],
|
||||
provider_key = "{provider_key}"
|
||||
output:
|
||||
"data/processed/{pid}/location_doryab_{day_segment}.csv"
|
||||
"data/interim/{pid}/locations_features/locations_r_{provider_key}.csv"
|
||||
script:
|
||||
"../src/features/location_doryab_features.py"
|
||||
"../src/features/location/locations_entry.R"
|
||||
|
||||
rule bluetooth_features:
|
||||
input:
|
||||
@@ -40,17 +40,17 @@ rule download_dataset:
|
||||
rule compute_day_segments:
|
||||
input:
|
||||
find_day_segments_input_file
|
||||
config["DAY_SEGMENTS"]["FILE"]
|
||||
params:
|
||||
day_segments_type = find_day_segments_input_type
|
||||
day_segments_type = config["DAY_SEGMENTS"]["TYPE"]
|
||||
output:
|
||||
segments_file = "data/interim/{sensor}_day_segments.csv",
|
||||
segments_labels_file = "data/interim/{sensor}_day_segments_labels.csv",
|
||||
segments_file = "data/interim/day_segments.csv",
|
||||
segments_labels_file = "data/interim/day_segments_labels.csv",
|
||||
script:
|
||||
"../src/data/compute_day_segments.py"
|
||||
|
||||
PHONE_SENSORS = []
|
||||
PHONE_SENSORS.extend([config["MESSAGES"]["DB_TABLE"], config["CALLS"]["DB_TABLE"], config["BARNETT_LOCATION"]["DB_TABLE"], config["DORYAB_LOCATION"]["DB_TABLE"], config["BLUETOOTH"]["DB_TABLE"], config["BATTERY"]["DB_TABLE"], config["SCREEN"]["DB_TABLE"], config["LIGHT"]["DB_TABLE"], config["ACCELEROMETER"]["DB_TABLE"], config["APPLICATIONS_FOREGROUND"]["DB_TABLE"], config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["CONVERSATION"]["DB_TABLE"]["IOS"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]])
|
||||
PHONE_SENSORS.extend([config["MESSAGES"]["DB_TABLE"], config["CALLS"]["DB_TABLE"], config["LOCATIONS"]["DB_TABLE"], config["BLUETOOTH"]["DB_TABLE"], config["BATTERY"]["DB_TABLE"], config["SCREEN"]["DB_TABLE"], config["LIGHT"]["DB_TABLE"], config["ACCELEROMETER"]["DB_TABLE"], config["APPLICATIONS_FOREGROUND"]["DB_TABLE"], config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["CONVERSATION"]["DB_TABLE"]["IOS"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]])
|
||||
PHONE_SENSORS.extend(config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"])
|
||||
|
||||
if len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) > 0:
|
||||
@@ -62,11 +62,11 @@ if len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) > 0:
rule readable_datetime:
|
||||
input:
|
||||
sensor_input = "data/raw/{pid}/{sensor}_raw.csv",
|
||||
day_segments = "data/interim/{sensor}_day_segments.csv"
|
||||
day_segments = "data/interim/day_segments.csv"
|
||||
params:
|
||||
timezones = None,
|
||||
fixed_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"],
|
||||
day_segments_type = find_day_segments_input_type
|
||||
day_segments_type = config["DAY_SEGMENTS"]["TYPE"]
|
||||
wildcard_constraints:
|
||||
sensor = '.*(' + '|'.join([re.escape(x) for x in PHONE_SENSORS]) + ').*' # only process smartphone sensors, not fitbit
|
||||
output:
|
||||
@@ -108,19 +108,22 @@ rule unify_ios_android:
script:
|
||||
"../src/data/unify_ios_android.R"
|
||||
|
||||
rule resample_fused_location:
|
||||
rule process_location_types:
|
||||
input:
|
||||
locations = "data/raw/{pid}/{sensor}_raw.csv",
|
||||
phone_sensed_bins = rules.phone_sensed_bins.output
|
||||
locations = "data/raw/{pid}/{sensor}_with_datetime.csv",
|
||||
phone_sensed_bins = rules.phone_sensed_bins.output,
|
||||
day_segments = "data/interim/day_segments.csv"
|
||||
params:
|
||||
bin_size = config["PHONE_VALID_SENSED_BINS"]["BIN_SIZE"],
|
||||
timezone = config["RESAMPLE_FUSED_LOCATION"]["TIMEZONE"],
|
||||
consecutive_threshold = config["RESAMPLE_FUSED_LOCATION"]["CONSECUTIVE_THRESHOLD"],
|
||||
time_since_valid_location = config["RESAMPLE_FUSED_LOCATION"]["TIME_SINCE_VALID_LOCATION"]
|
||||
timezone = config["LOCATIONS"]["TIMEZONE"],
|
||||
consecutive_threshold = config["LOCATIONS"]["FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD"],
|
||||
time_since_valid_location = config["LOCATIONS"]["FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION"],
|
||||
day_segments_type = config["DAY_SEGMENTS"]["TYPE"],
|
||||
locations_to_use = "{locations_to_used}"
|
||||
output:
|
||||
"data/raw/{pid}/{sensor}_resampled.csv"
|
||||
"data/raw/{pid}/{sensor}_processed_{locations_to_used}.csv"
|
||||
script:
|
||||
"../src/data/resample_fused_location.R"
|
||||
"../src/data/process_location_types.R"
|
||||
|
||||
rule application_genres:
|
||||
input:
|
||||
@@ -0,0 +1,85 @@
library("tidyverse")
|
||||
library("lubridate")
|
||||
|
||||
assign_to_day_segment <- function(data, day_segments, day_segments_type, fixed_timezone){
|
||||
|
||||
if(day_segments_type == "FREQUENCY_EVERY_DAY"){
|
||||
data <- data %>% mutate(local_date_time_obj = lubridate::parse_date_time(local_time, orders = c("HMS", "HM")))
|
||||
day_segments <- day_segments %>% mutate(start_time = lubridate::parse_date_time(start_time, orders = c("HMS", "HM")),
|
||||
end_time = start_time + minutes(length))
|
||||
|
||||
# Create a new column for each day_segment
|
||||
for(row_id in 1:nrow(day_segments)){
|
||||
row = day_segments[row_id,]
|
||||
data <- data %>% mutate(!!paste("local_day_segment", row_id, sep = "_") := ifelse(local_date_time_obj >= row$start_time & local_date_time_obj < row$end_time,
|
||||
paste0("[",
|
||||
row$label, "_",
|
||||
local_date, "_",
|
||||
paste(str_pad(hour(row$start_time),2, pad="0"), str_pad(minute(row$start_time),2, pad="0"), str_pad(second(row$start_time),2, pad="0"),sep =":"),
|
||||
"]"), NA))
|
||||
}
|
||||
|
||||
} else if (day_segments_type == "INTERVAL_EVERY_DAY"){
|
||||
|
||||
data_dates <- data %>% select(local_date) %>% distinct(local_date)
|
||||
inferred_day_segments <- crossing(day_segments, data_dates) %>%
|
||||
mutate(start_local_date_time_obj = lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM"), tz = fixed_timezone),
|
||||
end_local_date_time_obj = start_local_date_time_obj + lubridate::period(length),
|
||||
date_time_interval = lubridate::interval(start_local_date_time_obj, end_local_date_time_obj)) %>%
|
||||
group_by(label, local_date) %>%
|
||||
mutate(group_start_datetime = lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM"), tz = fixed_timezone),
|
||||
group_end_datetime = group_start_datetime + lubridate::period(length),
|
||||
group_start_datetime = min(group_start_datetime),
|
||||
group_end_datetime = max(group_end_datetime)) %>%
|
||||
ungroup()
|
||||
|
||||
|
||||
data <- data %>% mutate(local_date_time_obj = lubridate::ymd_hms(local_date_time, tz = fixed_timezone))
|
||||
|
||||
# Create a new column for each day_segment
|
||||
for(row_id in 1:nrow(inferred_day_segments)){
|
||||
row = inferred_day_segments[row_id,]
|
||||
data <- data %>% mutate(!!paste("local_day_segment", row_id, sep = "_") := ifelse(local_date_time_obj %within% row$date_time_interval,
|
||||
paste0("[",
|
||||
paste(sep= "#",
|
||||
row$label,
|
||||
lubridate::date(row$group_start_datetime),
|
||||
paste(str_pad(hour(row$group_start_datetime),2, pad="0"), str_pad(minute(row$group_start_datetime),2, pad="0"), str_pad(second(row$group_start_datetime),2, pad="0"),sep =":"),
|
||||
lubridate::date(row$group_end_datetime),
|
||||
paste(str_pad(hour(row$group_end_datetime),2, pad="0"), str_pad(minute(row$group_end_datetime),2, pad="0"), str_pad(second(row$group_end_datetime),2, pad="0"),sep =":")
|
||||
),
|
||||
"]"), NA))
|
||||
}
|
||||
|
||||
|
||||
} else if ( day_segments_type == "INTERVAL_FLEXIBLE_DAY"){
|
||||
data <- data %>% mutate(local_date_time_obj = lubridate::ymd_hms(local_date_time, tz = fixed_timezone))
|
||||
day_segments <- day_segments %>% mutate(shift = ifelse(shift == "0", "0seconds", shift),
|
||||
start_local_date_time_obj = lubridate::ymd_hms(start_date_time, tz = fixed_timezone) + (lubridate::period(shift) * ifelse(shift_direction >= 0, 1, -1)),
|
||||
end_local_date_time_obj = start_local_date_time_obj + lubridate::period(length),
|
||||
date_time_interval = lubridate::interval(start_local_date_time_obj, end_local_date_time_obj))
|
||||
|
||||
# Create a new column for each day_segment
|
||||
for(row_id in 1:nrow(day_segments)){
|
||||
row = day_segments[row_id,]
|
||||
print(row$length)
|
||||
data <- data %>% mutate(!!paste("local_day_segment", row_id, sep = "_") := ifelse(local_date_time_obj %within% row$date_time_interval,
|
||||
paste0("[",
|
||||
paste(sep= "#",
|
||||
row$label,
|
||||
lubridate::date(row$start_local_date_time_obj),
|
||||
paste(str_pad(hour(row$start_local_date_time_obj),2, pad="0"), str_pad(minute(row$start_local_date_time_obj),2, pad="0"), str_pad(second(row$start_local_date_time_obj),2, pad="0"),sep =":"),
|
||||
lubridate::date(row$end_local_date_time_obj),
|
||||
paste(str_pad(hour(row$end_local_date_time_obj),2, pad="0"), str_pad(minute(row$end_local_date_time_obj),2, pad="0"), str_pad(second(row$end_local_date_time_obj),2, pad="0"),sep =":")
|
||||
),
|
||||
"]"), NA))
|
||||
}
|
||||
}
|
||||
|
||||
# Join all day_segments in a single column
|
||||
data <- data %>%
|
||||
unite("assigned_segments", starts_with("local_day_segment"), sep = "|", na.rm = TRUE) %>%
|
||||
select(-local_date_time_obj)
|
||||
|
||||
return(data)
|
||||
}
@@ -0,0 +1,76 @@
source("renv/activate.R")
|
||||
library(dplyr)
|
||||
library(readr)
|
||||
library(tidyr)
|
||||
|
||||
source("src/data/assign_to_day_segment.R")
|
||||
|
||||
bin_size <- snakemake@params[["bin_size"]]
|
||||
timezone <- snakemake@params[["timezone"]]
|
||||
consecutive_threshold <- snakemake@params[["consecutive_threshold"]]
|
||||
time_since_valid_location <- snakemake@params[["time_since_valid_location"]]
|
||||
locations_to_use <- snakemake@params[["locations_to_use"]]
|
||||
day_segments <- read.csv(snakemake@input[["day_segments"]])
|
||||
day_segments_type <- snakemake@params[["day_segments_type"]]
|
||||
|
||||
phone_sensed_bins <- read_csv(snakemake@input[["phone_sensed_bins"]], col_types = cols(local_date = col_character()))
|
||||
locations <- read_csv(snakemake@input[["locations"]], col_types = cols()) %>%
    filter(double_latitude != 0 & double_longitude != 0)
|
||||
|
||||
locations_to_use <- snakemake@params["locations_to_use"]
|
||||
if(!locations_to_use %in% c("ALL", "FUSED_RESAMPLED", "GPS")){
|
||||
print("Unkown location filter, provide one of the following three: ALL, GPS, or FUSED_RESAMPLED")
|
||||
quit(save = "no", status = 1, runLast = FALSE)
|
||||
}
|
||||
|
||||
|
||||
if(locations_to_use == "ALL"){
|
||||
processed_locations <- locations
|
||||
} else if(locations_to_use == "GPS"){
|
||||
processed_locations <- locations %>% filter(provider == "gps")
|
||||
} else if(locations_to_use == "FUSED_RESAMPLED"){
|
||||
locations <- locations %>% filter(provider == "fused")
|
||||
if(nrow(locations) > 0){
|
||||
sensed_minute_bins <- phone_sensed_bins %>%
|
||||
pivot_longer(-local_date, names_to = c("hour", "bin"), names_sep = "_", values_to = "sensor_count") %>%
|
||||
mutate(hour = as.integer(hour), bin = as.integer(bin)) %>%
|
||||
complete(nesting(local_date, hour), bin = seq(0, 59,1)) %>%
|
||||
fill(sensor_count) %>%
|
||||
mutate(timestamp = as.numeric(as.POSIXct(paste0(local_date, " ", hour,":", bin,":00"), format = "%Y-%m-%d %H:%M:%S", tz = timezone)) * 1000 ) %>%
|
||||
filter(sensor_count > 0) %>%
|
||||
select(timestamp)
|
||||
|
||||
resampled_locations <- locations %>%
|
||||
select(-assigned_segments) %>%
|
||||
bind_rows(sensed_minute_bins) %>%
|
||||
mutate(provider = replace_na(provider, "resampled")) %>%
|
||||
arrange(timestamp) %>%
|
||||
# We group and therefore, fill in, missing rows that appear after a valid fused location record and exist
|
||||
# within consecutive_threshold minutes from each other
|
||||
mutate(consecutive_time_diff = c(1, diff(timestamp)),
|
||||
resample_group = cumsum(!is.na(double_longitude) | consecutive_time_diff > (1000 * 60 * consecutive_threshold))) %>%
|
||||
group_by(resample_group) %>%
|
||||
# drop rows that are logged after time_since_valid_location minutes from the last valid fused location
|
||||
filter((timestamp - first(timestamp) < (1000 * 60 * time_since_valid_location))) %>%
|
||||
fill(-timestamp, -resample_group) %>%
|
||||
select(-consecutive_time_diff) %>%
|
||||
drop_na(double_longitude, double_latitude, accuracy) %>%
|
||||
# Add local date_time
|
||||
mutate(utc_date_time = as.POSIXct(timestamp/1000, origin="1970-01-01", tz="UTC"),
|
||||
local_date_time = format(utc_date_time, tz = timezone, usetz = F)) %>%
|
||||
separate(local_date_time, c("local_date","local_time"), "\\s", remove = FALSE) %>%
|
||||
separate(local_time, c("local_hour", "local_minute"), ":", remove = FALSE, extra = "drop") %>%
|
||||
mutate(local_hour = as.numeric(local_hour),
|
||||
local_minute = as.numeric(local_minute)) %>%
|
||||
# Delete resampled rows that exist in the same minute as other original (fused) rows
|
||||
group_by(local_date, local_hour, local_minute) %>%
|
||||
mutate(n = n()) %>%
|
||||
filter(n == 1 | (n > 1 & provider == "fused")) %>%
|
||||
select(-n) %>%
|
||||
ungroup()
|
||||
processed_locations <- assign_to_day_segment(resampled_locations, day_segments, day_segments_type, timezone)
|
||||
} else {
|
||||
processed_locations <- locations
|
||||
}
|
||||
}
|
||||
write.csv(processed_locations,snakemake@output[[1]], row.names = F)
|
@@ -1,8 +1,8 @@
source("renv/activate.R")
|
||||
|
||||
library("tidyverse")
|
||||
library("readr")
|
||||
library("lubridate")
|
||||
|
||||
source("src/data/assign_to_day_segment.R")
|
||||
|
||||
input <- read.csv(snakemake@input[["sensor_input"]]) %>% arrange(timestamp)
|
||||
day_segments <- read.csv(snakemake@input[["day_segments"]])
|
@@ -11,89 +11,6 @@ sensor_output <- snakemake@output[[1]]
timezone_periods <- snakemake@params[["timezone_periods"]]
|
||||
fixed_timezone <- snakemake@params[["fixed_timezone"]]
|
||||
|
||||
assign_to_day_segment <- function(data, day_segments, day_segments_type, fixed_timezone){
|
||||
|
||||
if(day_segments_type == "FREQUENCY_EVERY_DAY"){
|
||||
data <- data %>% mutate(local_date_time_obj = lubridate::parse_date_time(local_time, orders = c("HMS", "HM")))
|
||||
day_segments <- day_segments %>% mutate(start_time = lubridate::parse_date_time(start_time, orders = c("HMS", "HM")),
|
||||
end_time = start_time + minutes(length))
|
||||
|
||||
# Create a new column for each day_segment
|
||||
for(row_id in 1:nrow(day_segments)){
|
||||
row = day_segments[row_id,]
|
||||
data <- data %>% mutate(!!paste("local_day_segment", row_id, sep = "_") := ifelse(local_date_time_obj >= row$start_time & local_date_time_obj < row$end_time,
|
||||
paste0("[",
|
||||
row$label, "_",
|
||||
local_date, "_",
|
||||
paste(str_pad(hour(row$start_time),2, pad="0"), str_pad(minute(row$start_time),2, pad="0"), str_pad(second(row$start_time),2, pad="0"),sep =":"),
|
||||
"]"), NA))
|
||||
}
|
||||
|
||||
} else if (day_segments_type == "INTERVAL_EVERY_DAY"){
|
||||
|
||||
data_dates <- data %>% select(local_date) %>% distinct(local_date)
|
||||
inferred_day_segments <- crossing(day_segments, data_dates) %>%
|
||||
mutate(start_local_date_time_obj = lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM"), tz = fixed_timezone),
|
||||
end_local_date_time_obj = start_local_date_time_obj + lubridate::period(length),
|
||||
date_time_interval = lubridate::interval(start_local_date_time_obj, end_local_date_time_obj)) %>%
|
||||
group_by(label, local_date) %>%
|
||||
mutate(group_start_datetime = lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM"), tz = fixed_timezone),
|
||||
group_end_datetime = group_start_datetime + lubridate::period(length),
|
||||
group_start_datetime = min(group_start_datetime),
|
||||
group_end_datetime = max(group_end_datetime)) %>%
|
||||
ungroup()
|
||||
|
||||
|
||||
data <- data %>% mutate(local_date_time_obj = lubridate::ymd_hms(local_date_time, tz = fixed_timezone))
|
||||
|
||||
# Create a new column for each day_segment
|
||||
for(row_id in 1:nrow(inferred_day_segments)){
|
||||
row = inferred_day_segments[row_id,]
|
||||
data <- data %>% mutate(!!paste("local_day_segment", row_id, sep = "_") := ifelse(local_date_time_obj %within% row$date_time_interval,
|
||||
paste0("[",
|
||||
paste(sep= "#",
|
||||
row$label,
|
||||
lubridate::date(row$group_start_datetime),
|
||||
paste(str_pad(hour(row$group_start_datetime),2, pad="0"), str_pad(minute(row$group_start_datetime),2, pad="0"), str_pad(second(row$group_start_datetime),2, pad="0"),sep =":"),
|
||||
lubridate::date(row$group_end_datetime),
|
||||
paste(str_pad(hour(row$group_end_datetime),2, pad="0"), str_pad(minute(row$group_end_datetime),2, pad="0"), str_pad(second(row$group_end_datetime),2, pad="0"),sep =":")
|
||||
),
|
||||
"]"), NA))
|
||||
}
|
||||
|
||||
|
||||
} else if ( day_segments_type == "INTERVAL_FLEXIBLE_DAY"){
|
||||
data <- data %>% mutate(local_date_time_obj = lubridate::ymd_hms(local_date_time, tz = fixed_timezone))
|
||||
day_segments <- day_segments %>% mutate(shift = ifelse(shift == "0", "0seconds", shift),
|
||||
start_local_date_time_obj = lubridate::ymd_hms(start_date_time, tz = fixed_timezone) + (lubridate::period(shift) * ifelse(shift_direction >= 0, 1, -1)),
|
||||
end_local_date_time_obj = start_local_date_time_obj + lubridate::period(length),
|
||||
date_time_interval = lubridate::interval(start_local_date_time_obj, end_local_date_time_obj))
|
||||
|
||||
# Create a new column for each day_segment
|
||||
for(row_id in 1:nrow(day_segments)){
|
||||
row = day_segments[row_id,]
|
||||
print(row$length)
|
||||
data <- data %>% mutate(!!paste("local_day_segment", row_id, sep = "_") := ifelse(local_date_time_obj %within% row$date_time_interval,
|
||||
paste0("[",
|
||||
paste(sep= "#",
|
||||
row$label,
|
||||
lubridate::date(row$start_local_date_time_obj),
|
||||
paste(str_pad(hour(row$start_local_date_time_obj),2, pad="0"), str_pad(minute(row$start_local_date_time_obj),2, pad="0"), str_pad(second(row$start_local_date_time_obj),2, pad="0"),sep =":"),
|
||||
lubridate::date(row$end_local_date_time_obj),
|
||||
paste(str_pad(hour(row$end_local_date_time_obj),2, pad="0"), str_pad(minute(row$end_local_date_time_obj),2, pad="0"), str_pad(second(row$end_local_date_time_obj),2, pad="0"),sep =":")
|
||||
),
|
||||
"]"), NA))
|
||||
}
|
||||
}
|
||||
|
||||
# Join all day_segments in a single column
|
||||
data <- data %>%
|
||||
unite("assigned_segments", starts_with("local_day_segment"), sep = "|", na.rm = TRUE) %>%
|
||||
select(-local_date_time_obj)
|
||||
|
||||
return(data)
|
||||
}
|
||||
|
||||
split_local_date_time <- function(data, day_segments){
|
||||
split_data <- data %>%
|
||||
separate(local_date_time, c("local_date","local_time"), "\\s", remove = FALSE) %>%
|
||||
@@ -1,59 +0,0 @@
source("renv/activate.R")
|
||||
|
||||
library(dplyr)
|
||||
library(readr)
|
||||
library(tidyr)
|
||||
|
||||
bin_size <- snakemake@params[["bin_size"]]
|
||||
timezone <- snakemake@params[["timezone"]]
|
||||
consecutive_threshold <- snakemake@params[["consecutive_threshold"]]
|
||||
time_since_valid_location <- snakemake@params[["time_since_valid_location"]]
|
||||
|
||||
locations <- read_csv(snakemake@input[["locations"]], col_types = cols()) %>% filter(provider == "fused")
|
||||
phone_sensed_bins <- read_csv(snakemake@input[["phone_sensed_bins"]], col_types = cols(local_date = col_character()))
|
||||
|
||||
if(nrow(locations) > 0){
|
||||
sensed_minute_bins <- phone_sensed_bins %>%
|
||||
pivot_longer(-local_date, names_to = c("hour", "bin"), names_sep = "_", values_to = "sensor_count") %>%
|
||||
mutate(hour = as.integer(hour), bin = as.integer(bin)) %>%
|
||||
complete(nesting(local_date, hour), bin = seq(0, 59,1)) %>%
|
||||
fill(sensor_count) %>%
|
||||
mutate(timestamp = as.numeric(as.POSIXct(paste0(local_date, " ", hour,":", bin,":00"), format = "%Y-%m-%d %H:%M:%S", tz = timezone)) * 1000 ) %>%
|
||||
filter(sensor_count > 0) %>%
|
||||
select(timestamp)
|
||||
|
||||
resampled_locations <- locations %>%
|
||||
bind_rows(sensed_minute_bins) %>%
|
||||
mutate(provider = replace_na(provider, "resampled")) %>%
|
||||
arrange(timestamp) %>%
|
||||
# We group and therefore, fill in, missing rows that appear after a valid fused location record and exist
|
||||
# within consecutive_threshold minutes from each other
|
||||
mutate(consecutive_time_diff = c(1, diff(timestamp)),
|
||||
resample_group = cumsum(!is.na(double_longitude) | consecutive_time_diff > (1000 * 60 * consecutive_threshold))) %>%
|
||||
group_by(resample_group) %>%
|
||||
# drop rows that are logged after time_since_valid_location minutes from the last valid fused location
|
||||
filter((timestamp - first(timestamp) < (1000 * 60 * time_since_valid_location))) %>%
|
||||
fill(-timestamp, -resample_group) %>%
|
||||
select(-consecutive_time_diff) %>%
|
||||
drop_na(double_longitude, double_latitude, accuracy) %>%
|
||||
# Add local date_time
|
||||
mutate(utc_date_time = as.POSIXct(timestamp/1000, origin="1970-01-01", tz="UTC"),
|
||||
local_date_time = format(utc_date_time, tz = timezone, usetz = F)) %>%
|
||||
separate(local_date_time, c("local_date","local_time"), "\\s", remove = FALSE) %>%
|
||||
separate(local_time, c("local_hour", "local_minute"), ":", remove = FALSE, extra = "drop") %>%
|
||||
mutate(local_hour = as.numeric(local_hour),
|
||||
local_minute = as.numeric(local_minute),
|
||||
local_day_segment = case_when(local_hour %in% 0:5 ~ "night",
|
||||
local_hour %in% 6:11 ~ "morning",
|
||||
local_hour %in% 12:17 ~ "afternoon",
|
||||
local_hour %in% 18:23 ~ "evening")) %>%
|
||||
# Delete resampled rows that exist in the same minute as other original (fused) rows
|
||||
group_by(local_date, local_hour, local_minute) %>%
|
||||
mutate(n = n()) %>%
|
||||
filter(n == 1 | (n > 1 & provider == "fused")) %>%
|
||||
select(-n)
|
||||
|
||||
write.csv(resampled_locations,snakemake@output[[1]], row.names = F)
|
||||
} else {
|
||||
write.csv(locations,snakemake@output[[1]], row.names = F)
|
||||
}
@@ -0,0 +1,14 @@
source("renv/activate.R")
|
||||
|
||||
library("tidyr")
|
||||
library("dplyr")
|
||||
|
||||
location_features_files <- snakemake@input[["location_features"]]
|
||||
location_features <- setNames(data.frame(matrix(ncol = 1, nrow = 0)), c("local_segment"))
|
||||
|
||||
|
||||
for(location_features_file in location_features_files){
|
||||
location_features <- merge(location_features, read.csv(location_features_file), all = TRUE)
|
||||
}
|
||||
|
||||
write.csv(location_features, snakemake@output[[1]], row.names = FALSE)
|
@@ -0,0 +1,96 @@
source("renv/activate.R")
|
||||
library("dplyr")
|
||||
library("stringr")
|
||||
|
||||
# Load Ian Barnett's code. Taken from https://scholar.harvard.edu/ibarnett/software/gpsmobility
|
||||
file.sources = list.files(c("src/features/location/barnett/library"), pattern="*.R$", full.names=TRUE, ignore.case=TRUE)
|
||||
sapply(file.sources,source,.GlobalEnv)
|
||||
|
||||
create_empty_file <- function(requested_features){
|
||||
return(data.frame(local_segment= character(),
|
||||
locations_barnett_hometime= numeric(),
|
||||
locations_barnett_disttravelled= numeric(),
|
||||
locations_barnett_rog= numeric(),
|
||||
locations_barnett_maxdiam= numeric(),
|
||||
locations_barnett_maxhomedist= numeric(),
|
||||
locations_barnett_siglocsvisited= numeric(),
|
||||
locations_barnett_avgflightlen= numeric(),
|
||||
locations_barnett_stdflightlen= numeric(),
|
||||
locations_barnett_avgflightdur= numeric(),
|
||||
locations_barnett_stdflightdur= numeric(),
|
||||
locations_barnett_probpause= numeric(),
|
||||
locations_barnett_siglocentropy= numeric(),
|
||||
locations_barnett_minsmissing= numeric(),
|
||||
locations_barnett_circdnrtn= numeric(),
|
||||
locations_barnett_wkenddayrtn= numeric(),
|
||||
locations_barnett_minutes_data_used= numeric()
|
||||
) %>% select(all_of(requested_features)))
|
||||
}
|
||||
|
||||
barnett_location_features <- function(location_data, day_segment, params){
|
||||
location_features <- NULL
|
||||
location <- location_data
|
||||
accuracy_limit <- params[["ACCURACY_LIMIT"]]
|
||||
timezone <- params[["TIMEZONE"]]
|
||||
minutes_data_used <- params[["MINUTES_DATA_USED"]]
|
||||
|
||||
# Compute what features were requested
|
||||
available_features <- c("hometime","disttravelled","rog","maxdiam", "maxhomedist","siglocsvisited","avgflightlen", "stdflightlen",
|
||||
"avgflightdur","stdflightdur", "probpause","siglocentropy","minsmissing", "circdnrtn","wkenddayrtn")
|
||||
requested_features <- intersect(unlist(params["FEATURES"], use.names = F), available_features)
|
||||
requested_features <- c("local_segment", paste("locations_barnett", requested_features, sep = "_"))
|
||||
if(minutes_data_used)
|
||||
requested_features <- c(requested_features, "locations_barnett_minutes_data_used")
|
||||
|
||||
# Excludes datasets with less than 24 hours of data
|
||||
if(max(location$timestamp) - min(location$timestamp) < 86400000)
|
||||
location <- head(location, 0)
|
||||
|
||||
if (nrow(location) > 1){
|
||||
# Filter by segment and skipping any non-daily segment
|
||||
location <- location %>% filter_data_by_segment(day_segment)
|
||||
segment <- location %>% head(1) %>% pull(local_segment)
|
||||
segment_data <- str_split(segment, "#")[[1]]
|
||||
if(segment_data[[2]] != segment_data[[4]] || segment_data[[3]] != "00:00:00" || segment_data[[5]] != "23:59:59"){
|
||||
warning(paste("Barnett's location features cannot be computed for day segmentes that are not daily (cover 00:00:00 to 23:59:59 of every day). Skipping for ", segment))
            location_features <- create_empty_file(requested_features)
        } else {
            # Count how many minutes of data we use to get location features
            # Some minutes have multiple fused rows
            location_minutes_used <- location %>%
                group_by(local_date, local_hour) %>%
                summarise(n_minutes = n_distinct(local_minute)) %>%
                group_by(local_date) %>%
                summarise(locations_barnett_minutes_data_used = sum(n_minutes)) %>%
                select(local_date, locations_barnett_minutes_data_used)

            # Save day segment to attach it later
            location_dates_segments <- location %>% select(local_date, local_segment) %>% distinct(local_date, .keep_all = TRUE)

            # Select only the columns that the algorithm needs
            location <- location %>% select(timestamp, latitude = double_latitude, longitude = double_longitude, altitude = double_altitude, accuracy)
            outputMobility <- MobilityFeatures(location, ACCURACY_LIM = accuracy_limit, tz = timezone)

            if(is.null(outputMobility)){
                location_features <- create_empty_file(requested_features)
            } else {
                # Copy index (dates) as a column
                features <- cbind(rownames(outputMobility$featavg), outputMobility$featavg)
                features <- as.data.frame(features)
                features[-1] <- lapply(lapply(features[-1], as.character), as.numeric)
                colnames(features) = c("local_date", tolower(paste("locations_barnett", colnames(outputMobility$featavg), sep = "_")))
                # Add the minute count column
                features <- left_join(features, location_minutes_used, by = "local_date")
                # Add the day segment column for consistency
                features <- left_join(features, location_dates_segments, by = "local_date")
                location_features <- features %>% select(all_of(requested_features))
            }
        }
    } else {
        location_features <- create_empty_file(requested_features)
    }

    if(ncol(location_features) != length(requested_features))
        stop(paste0("The number of features in the output dataframe (=", ncol(location_features), ") does not match the expected value (=", length(requested_features), "). Verify your barnett location features"))
    return(location_features)
}

@@ -4,21 +4,32 @@ from astropy.timeseries import LombScargle
from sklearn.cluster import DBSCAN
from math import radians, cos, sin, asin, sqrt

-def base_location_features(location_data, day_segment, requested_features, dbscan_eps, dbscan_minsamples, threshold_static, maximum_gap_allowed,sampling_frequency):
+def doryab_location_features(location_data, day_segment, params, filter_data_by_segment, *args, **kwargs):
+    requested_features = params["FEATURES"]
+    dbscan_eps = params["DBSCAN_EPS"]
+    dbscan_minsamples = params["DBSCAN_MINSAMPLES"]
+    threshold_static = params["THRESHOLD_STATIC"]
+    maximum_gap_allowed = params["MAXIMUM_GAP_ALLOWED"]
+    sampling_frequency = params["SAMPLING_FREQUENCY"]
+
+    minutes_data_used = params["MINUTES_DATA_USED"]
+    if(minutes_data_used):
+        requested_features.append("minutesdataused")
+
    # name of the features this function can compute
    base_features_names = ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","minutesdataused"]
    # the subset of requested features this function can compute
    features_to_compute = list(set(requested_features) & set(base_features_names))


    if location_data.empty:
-        location_features = pd.DataFrame(columns=["local_date"] + ["location_" + day_segment + "_" + x for x in features_to_compute])
+        location_features = pd.DataFrame(columns=["local_segment"] + ["locations_doryab_" + x for x in features_to_compute])
    else:
-        if day_segment != "daily":
-            location_data = location_data[location_data["local_day_segment"] == day_segment]
+        location_data = filter_data_by_segment(location_data, day_segment)

        if location_data.empty:
-            location_features = pd.DataFrame(columns=["local_date"] + ["location_" + day_segment + "_" + x for x in features_to_compute])
+            location_features = pd.DataFrame(columns=["local_segment"] + ["locations_doryab_" + x for x in features_to_compute])
        else:
            location_features = pd.DataFrame()

@@ -26,108 +37,108 @@ def base_location_features(location_data, day_segment, requested_features, dbsca
                sampling_frequency = getSamplingFrequency(location_data)

            if "minutesdataused" in features_to_compute:
-                for localDate in location_data["local_date"].unique():
-                    location_features.loc[localDate,"location_" + day_segment + "_minutesdataused"] = getMinutesData(location_data[location_data["local_date"]==localDate])
+                for localDate in location_data["local_segment"].unique():
+                    location_features.loc[localDate,"locations_doryab_minutesdataused"] = getMinutesData(location_data[location_data["local_segment"]==localDate])

-            location_features.index.name = 'local_date'
+            location_features.index.name = 'local_segment'

            location_data = location_data[(location_data['double_latitude']!=0.0) & (location_data['double_longitude']!=0.0)]

            if "locationvariance" in features_to_compute:
-                location_features["location_" + day_segment + "_locationvariance"] = location_data.groupby(['local_date'])['double_latitude'].var() + location_data.groupby(['local_date'])['double_longitude'].var()
+                location_features["locations_doryab_locationvariance"] = location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var()

            if "loglocationvariance" in features_to_compute:
-                location_features["location_" + day_segment + "_loglocationvariance"] = (location_data.groupby(['local_date'])['double_latitude'].var() + location_data.groupby(['local_date'])['double_longitude'].var()).apply(lambda x: np.log10(x) if x > 0 else None)
+                location_features["locations_doryab_loglocationvariance"] = (location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var()).apply(lambda x: np.log10(x) if x > 0 else None)

            preComputedDistanceandSpeed = pd.DataFrame()
-            for localDate in location_data['local_date'].unique():
-                distance, speeddf = get_all_travel_distances_meters_speed(location_data[location_data['local_date']==localDate],threshold_static,maximum_gap_allowed)
+            for localDate in location_data['local_segment'].unique():
+                distance, speeddf = get_all_travel_distances_meters_speed(location_data[location_data['local_segment']==localDate],threshold_static,maximum_gap_allowed)
                preComputedDistanceandSpeed.loc[localDate,"distance"] = distance.sum()
                preComputedDistanceandSpeed.loc[localDate,"avgspeed"] = speeddf[speeddf['speedTag'] == 'Moving']['speed'].mean()
                preComputedDistanceandSpeed.loc[localDate,"varspeed"] = speeddf[speeddf['speedTag'] == 'Moving']['speed'].var()

            if "totaldistance" in features_to_compute:
-                for localDate in location_data['local_date'].unique():
-                    location_features.loc[localDate,"location_" + day_segment + "_totaldistance"] = preComputedDistanceandSpeed.loc[localDate,"distance"]
+                for localDate in location_data['local_segment'].unique():
+                    location_features.loc[localDate,"locations_doryab_totaldistance"] = preComputedDistanceandSpeed.loc[localDate,"distance"]

            if "averagespeed" in features_to_compute:
-                for localDate in location_data['local_date'].unique():
-                    location_features.loc[localDate,"location_" + day_segment + "_averagespeed"] = preComputedDistanceandSpeed.loc[localDate,"avgspeed"]
+                for localDate in location_data['local_segment'].unique():
+                    location_features.loc[localDate,"locations_doryab_averagespeed"] = preComputedDistanceandSpeed.loc[localDate,"avgspeed"]

            if "varspeed" in features_to_compute:
-                for localDate in location_data['local_date'].unique():
-                    location_features.loc[localDate,"location_" + day_segment + "_varspeed"] = preComputedDistanceandSpeed.loc[localDate,"varspeed"]
+                for localDate in location_data['local_segment'].unique():
+                    location_features.loc[localDate,"locations_doryab_varspeed"] = preComputedDistanceandSpeed.loc[localDate,"varspeed"]

            if "circadianmovement" in features_to_compute:
-                for localDate in location_data['local_date'].unique():
-                    location_features.loc[localDate,"location_" + day_segment + "_circadianmovement"] = circadian_movement(location_data[location_data['local_date']==localDate])
+                for localDate in location_data['local_segment'].unique():
+                    location_features.loc[localDate,"locations_doryab_circadianmovement"] = circadian_movement(location_data[location_data['local_segment']==localDate])

            newLocationData = cluster_and_label(location_data, eps= distance_to_degrees(dbscan_eps), min_samples=dbscan_minsamples)

            if "numberofsignificantplaces" in features_to_compute:
-                for localDate in newLocationData['local_date'].unique():
-                    location_features.loc[localDate,"location_" + day_segment + "_numberofsignificantplaces"] = number_of_significant_places(newLocationData[newLocationData['local_date']==localDate])
+                for localDate in newLocationData['local_segment'].unique():
+                    location_features.loc[localDate,"locations_doryab_numberofsignificantplaces"] = number_of_significant_places(newLocationData[newLocationData['local_segment']==localDate])

            if "numberlocationtransitions" in features_to_compute:
-                for localDate in newLocationData['local_date'].unique():
-                    location_features.loc[localDate,"location_" + day_segment + "_numberlocationtransitions"] = number_location_transitions(newLocationData[newLocationData['local_date']==localDate])
+                for localDate in newLocationData['local_segment'].unique():
+                    location_features.loc[localDate,"locations_doryab_numberlocationtransitions"] = number_location_transitions(newLocationData[newLocationData['local_segment']==localDate])

            if "radiusgyration" in features_to_compute:
-                for localDate in newLocationData['local_date'].unique():
-                    location_features.loc[localDate,"location_" + day_segment + "_radiusgyration"] = radius_of_gyration(newLocationData[newLocationData['local_date']==localDate],sampling_frequency)
+                for localDate in newLocationData['local_segment'].unique():
+                    location_features.loc[localDate,"locations_doryab_radiusgyration"] = radius_of_gyration(newLocationData[newLocationData['local_segment']==localDate],sampling_frequency)

            if "timeattop1location" in features_to_compute:
-                for localDate in newLocationData['local_date'].unique():
-                    location_features.loc[localDate,"location_" + day_segment + "_timeattop1"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_date']==localDate],1,sampling_frequency)
+                for localDate in newLocationData['local_segment'].unique():
+                    location_features.loc[localDate,"locations_doryab_timeattop1"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_segment']==localDate],1,sampling_frequency)

            if "timeattop2location" in features_to_compute:
-                for localDate in newLocationData['local_date'].unique():
-                    location_features.loc[localDate,"location_" + day_segment + "_timeattop2"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_date']==localDate],2,sampling_frequency)
+                for localDate in newLocationData['local_segment'].unique():
+                    location_features.loc[localDate,"locations_doryab_timeattop2"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_segment']==localDate],2,sampling_frequency)

            if "timeattop3location" in features_to_compute:
-                for localDate in newLocationData['local_date'].unique():
-                    location_features.loc[localDate,"location_" + day_segment + "_timeattop3"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_date']==localDate],3,sampling_frequency)
+                for localDate in newLocationData['local_segment'].unique():
+                    location_features.loc[localDate,"locations_doryab_timeattop3"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_segment']==localDate],3,sampling_frequency)

            if "movingtostaticratio" in features_to_compute:
-                for localDate in newLocationData['local_date'].unique():
-                    location_features.loc[localDate,"location_" + day_segment + "_movingtostaticratio"] = (newLocationData[newLocationData['local_date']==localDate].shape[0]*sampling_frequency) / (location_data[location_data['local_date']==localDate].shape[0] * sampling_frequency)
+                for localDate in newLocationData['local_segment'].unique():
+                    location_features.loc[localDate,"locations_doryab_movingtostaticratio"] = (newLocationData[newLocationData['local_segment']==localDate].shape[0]*sampling_frequency) / (location_data[location_data['local_segment']==localDate].shape[0] * sampling_frequency)

            if "outlierstimepercent" in features_to_compute:
-                for localDate in newLocationData['local_date'].unique():
-                    location_features.loc[localDate,"location_" + day_segment + "_outlierstimepercent"] = outliers_time_percent(newLocationData[newLocationData['local_date']==localDate],sampling_frequency)
+                for localDate in newLocationData['local_segment'].unique():
+                    location_features.loc[localDate,"locations_doryab_outlierstimepercent"] = outliers_time_percent(newLocationData[newLocationData['local_segment']==localDate],sampling_frequency)

            preComputedmaxminCluster = pd.DataFrame()
-            for localDate in newLocationData['local_date'].unique():
-                smax, smin, sstd, smean = len_stay_at_clusters_in_minutes(newLocationData[newLocationData['local_date']==localDate],sampling_frequency)
-                preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_maxlengthstayatclusters"] = smax
-                preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_minlengthstayatclusters"] = smin
-                preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_stdlengthstayatclusters"] = sstd
-                preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_meanlengthstayatclusters"] = smean
+            for localDate in newLocationData['local_segment'].unique():
+                smax, smin, sstd, smean = len_stay_at_clusters_in_minutes(newLocationData[newLocationData['local_segment']==localDate],sampling_frequency)
+                preComputedmaxminCluster.loc[localDate,"locations_doryab_maxlengthstayatclusters"] = smax
+                preComputedmaxminCluster.loc[localDate,"locations_doryab_minlengthstayatclusters"] = smin
+                preComputedmaxminCluster.loc[localDate,"locations_doryab_stdlengthstayatclusters"] = sstd
+                preComputedmaxminCluster.loc[localDate,"locations_doryab_meanlengthstayatclusters"] = smean

            if "maxlengthstayatclusters" in features_to_compute:
-                for localDate in newLocationData['local_date'].unique():
-                    location_features.loc[localDate,"location_" + day_segment + "_maxlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_maxlengthstayatclusters"]
+                for localDate in newLocationData['local_segment'].unique():
+                    location_features.loc[localDate,"locations_doryab_maxlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"locations_doryab_maxlengthstayatclusters"]

            if "minlengthstayatclusters" in features_to_compute:
-                for localDate in newLocationData['local_date'].unique():
-                    location_features.loc[localDate,"location_" + day_segment + "_minlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_minlengthstayatclusters"]
+                for localDate in newLocationData['local_segment'].unique():
+                    location_features.loc[localDate,"locations_doryab_minlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"locations_doryab_minlengthstayatclusters"]

            if "stdlengthstayatclusters" in features_to_compute:
-                for localDate in newLocationData['local_date'].unique():
-                    location_features.loc[localDate,"location_" + day_segment + "_stdlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_stdlengthstayatclusters"]
+                for localDate in newLocationData['local_segment'].unique():
+                    location_features.loc[localDate,"locations_doryab_stdlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"locations_doryab_stdlengthstayatclusters"]

            if "meanlengthstayatclusters" in features_to_compute:
-                for localDate in newLocationData['local_date'].unique():
-                    location_features.loc[localDate,"location_" + day_segment + "_meanlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_meanlengthstayatclusters"]
+                for localDate in newLocationData['local_segment'].unique():
+                    location_features.loc[localDate,"locations_doryab_meanlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"locations_doryab_meanlengthstayatclusters"]

            if "locationentropy" in features_to_compute:
-                for localDate in newLocationData['local_date'].unique():
-                    location_features.loc[localDate,"location_" + day_segment + "_locationentropy"] = location_entropy(newLocationData[newLocationData['local_date']==localDate])
+                for localDate in newLocationData['local_segment'].unique():
+                    location_features.loc[localDate,"locations_doryab_locationentropy"] = location_entropy(newLocationData[newLocationData['local_segment']==localDate])

            if "normalizedlocationentropy" in features_to_compute:
-                for localDate in newLocationData['local_date'].unique():
-                    location_features.loc[localDate,"location_" + day_segment + "_normalizedlocationentropy"] = location_entropy_normalized(newLocationData[newLocationData['local_date']==localDate])
+                for localDate in newLocationData['local_segment'].unique():
+                    location_features.loc[localDate,"locations_doryab_normalizedlocationentropy"] = location_entropy_normalized(newLocationData[newLocationData['local_segment']==localDate])

            location_features = location_features.reset_index()

@@ -0,0 +1,44 @@
source("renv/activate.R")
source("src/features/utils/utils.R")
library("dplyr")
library("stringr")
library("tidyr")

location_data <- read.csv(snakemake@input[["location_data"]], stringsAsFactors = FALSE)
day_segments_labels <- read.csv(snakemake@input[["day_segments_labels"]], stringsAsFactors = FALSE)
provider <- snakemake@params["provider"][["provider"]]
provider_key <- snakemake@params["provider_key"]

location_features <- data.frame(local_segment = character(), stringsAsFactors = FALSE)

if(!"FEATURES" %in% names(provider))
    stop(paste0("Provider config[LOCATION][PROVIDERS][", provider_key, "] is missing a FEATURES attribute in config.yaml"))

if(provider[["COMPUTE"]] == TRUE){
    code_path <- paste0("src/features/location/", provider[["SRC_FOLDER"]], "/main.R")
    source(code_path)
    features_function <- match.fun(paste0(provider[["SRC_FOLDER"]], "_location_features"))
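    # Provider dispatch sketch: SRC_FOLDER doubles as the provider's folder and function prefix, so a
    # provider whose SRC_FOLDER is "barnett" is expected to live in src/features/location/barnett/main.R
    # and expose barnett_location_features (names inferred from this convention, not hard-coded here).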
    day_segments <- day_segments_labels %>% pull(label)
    for (day_segment in day_segments){
        print(paste(rapids_log_tag, "Processing", provider_key, day_segment))

        features <- features_function(location_data, day_segment, provider)

        # Check that all feature names contain the provider key so they are unique
        features_names <- colnames(features %>% select(-local_segment))
        if(any(!grepl(paste0(".*(", str_to_lower(provider_key), ").*"), features_names)))
            stop(paste("The name of all location features of", provider_key, "must contain its name in lower case but the following don't [", paste(features_names[!grepl(paste0(".*(", str_to_lower(provider_key), ").*"), features_names)], collapse = ", "), "]"))

        location_features <- merge(location_features, features, all = TRUE)
    }
} else {
    for(feature in provider[["FEATURES"]])
        location_features[,feature] <- NA
}

location_features <- location_features %>% separate(col = local_segment,
                                                    into = c("local_segment_label", "local_start_date", "local_start_time", "local_end_date", "local_end_time"),
                                                    sep = "#",
                                                    remove = FALSE)

write.csv(location_features, snakemake@output[[1]], row.names = FALSE)

@@ -0,0 +1,39 @@
import pandas as pd
from importlib import import_module, util
from pathlib import Path

# import filter_data_by_segment from src/features/utils/utils.py
spec = util.spec_from_file_location("util", str(Path(snakemake.scriptdir).parent / "utils" / "utils.py"))
mod = util.module_from_spec(spec)
spec.loader.exec_module(mod)
filter_data_by_segment = getattr(mod, "filter_data_by_segment")
rapids_log_tag = getattr(mod, "rapids_log_tag")

location_data = pd.read_csv(snakemake.input["location_data"][0])
day_segments_labels = pd.read_csv(snakemake.input["day_segments_labels"], header=0)
mypath = snakemake.params["mypath"]
provider = snakemake.params["provider"]
provider_key = snakemake.params["provider_key"]
location_features = pd.DataFrame(columns=["local_segment"])

if "FEATURES" not in provider:
    raise ValueError("Provider config[LOCATION][PROVIDERS][{}] is missing a FEATURES attribute in config.yaml".format(provider_key))

if provider["COMPUTE"] == True:
    code_path = provider["SRC_FOLDER"] + ".main"
    feature_module = import_module(code_path)
    feature_function = getattr(feature_module, provider["SRC_FOLDER"] + "_location_features")
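    # Provider dispatch sketch: a provider with SRC_FOLDER "doryab" is expected to resolve to the module
    # doryab.main and the function doryab_location_features; these names follow the convention above
    # rather than an explicit list in this script.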

    for day_segment in day_segments_labels["label"]:
        print("{} Processing {} {}".format(rapids_log_tag, provider_key, day_segment))
        features = feature_function(location_data, day_segment, provider, filter_data_by_segment=filter_data_by_segment)
        location_features = location_features.merge(features, how="outer")
else:
    for feature in provider["FEATURES"]:
        location_features[feature] = None

segment_columns = pd.DataFrame()
segment_columns[["local_segment_label", "local_start_date", "local_start_time", "local_end_date", "local_end_time"]] = location_features["local_segment"].str.split(pat="#", expand=True)
for i in range(segment_columns.shape[1]):
    location_features.insert(1 + i, segment_columns.columns[i], segment_columns[segment_columns.columns[i]])

location_features.to_csv(snakemake.output[0], index=False)

@@ -1,89 +0,0 @@
source("renv/activate.R")
# Load Ian Barnett's code. Taken from https://scholar.harvard.edu/ibarnett/software/gpsmobility
file.sources = list.files(c("src/features/location_barnett"), pattern="*.R$", full.names=TRUE, ignore.case=TRUE)
sapply(file.sources,source,.GlobalEnv)

library(dplyr)

write_empty_file <- function(file_path, requested_features){
    write.csv(data.frame(local_date= character(),
                         location_barnett_hometime= numeric(),
                         location_barnett_disttravelled= numeric(),
                         location_barnett_rog= numeric(),
                         location_barnett_maxdiam= numeric(),
                         location_barnett_maxhomedist= numeric(),
                         location_barnett_siglocsvisited= numeric(),
                         location_barnett_avgflightlen= numeric(),
                         location_barnett_stdflightlen= numeric(),
                         location_barnett_avgflightdur= numeric(),
                         location_barnett_stdflightdur= numeric(),
                         location_barnett_probpause= numeric(),
                         location_barnett_siglocentropy= numeric(),
                         location_barnett_minsmissing= numeric(),
                         location_barnett_circdnrtn= numeric(),
                         location_barnett_wkenddayrtn= numeric(),
                         minutes_data_used= numeric()
    ) %>% select(requested_features), file_path, row.names = F)
}

location <- read.csv(snakemake@input[["locations"]], stringsAsFactors = F)
# The choice between RESAMPLE_FUSED and the original location data happens at the rule level in the function
# optional_location_input in features.snakefile
locations_to_use <- snakemake@params[["locations_to_use"]]
accuracy_limit <- snakemake@params[["accuracy_limit"]]
timezone <- snakemake@params[["timezone"]]
minutes_data_used <- snakemake@params[["minutes_data_used"]]
requested_features <- intersect(unlist(snakemake@params["features"], use.names = F),
                                c("hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","minsmissing","circdnrtn","wkenddayrtn"))
requested_features <- c("local_date", paste("location_barnett", requested_features, sep = "_"))
if(minutes_data_used)
    requested_features <- c(requested_features, "minutes_data_used")

if(!locations_to_use %in% c("ALL_EXCEPT_FUSED", "RESAMPLE_FUSED", "ALL")){
    print("Unkown filter, provide one of the following three: ALL, ALL_EXCEPT_FUSED, or RESAMPLE_FUSED")
    quit(save = "no", status = 1, runLast = FALSE)
}

# excludes fused and resample
if(locations_to_use == "ALL_EXCEPT_FUSED")
    location <- location %>% filter(provider == "gps")

# Remove 0,0 location coordinates
location <- location %>% filter(double_latitude != 0 & double_longitude != 0)

# Excludes datasets with less than 24 hours of data
if(max(location$timestamp) - min(location$timestamp) < 86400000)
    location <- head(location, 0)

if (nrow(location) > 1){

    # Count how many minutes of data we use to get location features
    # Some minutes have multiple fused rows
    location_minutes_used <- location %>%
        group_by(local_date, local_hour) %>%
        summarise(n_minutes = n_distinct(local_minute)) %>%
        group_by(local_date) %>%
        summarise(minutes_data_used = sum(n_minutes)) %>%
        select(local_date, minutes_data_used)

    location <- location %>%
        select(timestamp, latitude = double_latitude, longitude = double_longitude, altitude = double_altitude, accuracy)

    outputMobility <- MobilityFeatures(location, ACCURACY_LIM = accuracy_limit, tz = timezone)

    if(is.null(outputMobility)){
        write_empty_file(snakemake@output[[1]], requested_features)
    } else{
        # Copy index (dates) as a column
        features <- cbind(rownames(outputMobility$featavg), outputMobility$featavg)
        features <- as.data.frame(features)
        features[-1] <- lapply(lapply(features[-1], as.character), as.numeric)
        colnames(features)=c("local_date",tolower(paste("location_barnett", colnames(outputMobility$featavg), sep = "_")))
        # Add the minute count column
        features <- left_join(features, location_minutes_used, by = "local_date")
        write.csv(features %>% select(requested_features), snakemake@output[[1]], row.names = F)
    }

} else {
    write_empty_file(snakemake@output[[1]], requested_features)
}

@@ -1,24 +0,0 @@
import pandas as pd
from location_doryab.location_base import base_location_features

location_data = pd.read_csv(snakemake.input[0], parse_dates=["local_date_time", "local_date"])
day_segment = snakemake.params["day_segment"]
requested_features = snakemake.params["features"]
location_features = pd.DataFrame(columns=["local_date"])
dbscan_eps = snakemake.params["dbscan_eps"]
dbscan_minsamples = snakemake.params["dbscan_minsamples"]
threshold_static = snakemake.params["threshold_static"]
maximum_gap_allowed = snakemake.params["maximum_gap_allowed"]
minutes_data_used = snakemake.params["minutes_data_used"]
sampling_frequency = snakemake.params["sampling_frequency"]

if(minutes_data_used):
    requested_features.append("minutesdataused")

base_features = base_location_features(location_data, day_segment, requested_features, dbscan_eps, dbscan_minsamples, threshold_static, maximum_gap_allowed, sampling_frequency)

location_features = location_features.merge(base_features, on="local_date", how="outer")

assert len(requested_features) + 1 == location_features.shape[1], "The number of features in the output dataframe (=" + str(location_features.shape[1]) + ") does not match the expected value (=" + str(len(requested_features)) + " + 1). Verify your location feature extraction functions"

location_features.to_csv(snakemake.output[0], index=False)

@@ -0,0 +1,12 @@
library("stringr")
filter_data_by_segment <- function(data, day_segment){
    # Filter the rows that belong to day_segment, and put the segment's full name in a new column for grouping
    date_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2}"
    hour_regex = "[0-9]{2}:[0-9]{2}:[0-9]{2}"
    data <- data %>%
        filter(grepl(paste0("\\[", day_segment, "#"), assigned_segments)) %>%
        mutate(local_segment = str_extract(assigned_segments, paste0("\\[", day_segment, "#", date_regex, "#", hour_regex, "#", date_regex, "#", hour_regex, "\\]")),
               local_segment = str_sub(local_segment, 2, -2)) # get rid of the first and last characters ([])
    return(data)
}
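# Usage sketch (hypothetical values): given a row whose assigned_segments contains
# "[daily#2020-01-01#00:00:00#2020-01-01#23:59:59]", filter_data_by_segment(data, "daily") keeps that row
# and stores "daily#2020-01-01#00:00:00#2020-01-01#23:59:59" in its local_segment column.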
rapids_log_tag <- "RAPIDS:"

@@ -0,0 +1,9 @@

def filter_data_by_segment(data, day_segment):
    date_regex = "[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2}"
    hour_regex = "[0-9]{2}:[0-9]{2}:[0-9]{2}"
    segment_regex = "\[({}#{}#{}#{}#{})\]".format(day_segment, date_regex, hour_regex, date_regex, hour_regex)
    data["local_segment"] = data["assigned_segments"].str.extract(segment_regex, expand=True)
    return(data.dropna(subset = ["local_segment"]))
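# Usage sketch (hypothetical values): unlike the R helper, this version extracts the segment first and then
# drops non-matching rows, so a row with assigned_segments "[daily#2020-01-01#00:00:00#2020-01-01#23:59:59]"
# is kept with that segment string in local_segment, while rows without a match are removed by dropna.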

rapids_log_tag = "RAPIDS:"