From b0f1477d7e6c7292b645e9ff17e9728c1173144b Mon Sep 17 00:00:00 2001 From: JulioV Date: Fri, 28 Aug 2020 13:53:00 -0400 Subject: [PATCH] Migrate location providers to new file structure and segments --- Snakefile | 34 ++--- config.yaml | 49 ++++---- docs/features/extracted.rst | 40 +++--- rules/common.smk | 34 +---- rules/features.smk | 44 +++---- rules/preprocessing.smk | 33 ++--- src/data/assign_to_day_segment.R | 85 +++++++++++++ src/data/process_location_types.R | 76 +++++++++++ src/data/readable_datetime.R | 87 +------------ src/data/resample_fused_location.R | 59 --------- src/features/join_features_from_providers.R | 14 +++ .../barnett/library}/AvgFlightDur.R | 0 .../barnett/library}/AvgFlightLen.R | 0 .../barnett/library}/Collapse2Pause.R | 0 .../barnett/library}/DailyMobilityPlots.R | 0 .../barnett/library}/DailyRoutineIndex.R | 0 .../barnett/library}/DayDist.R | 0 .../barnett/library}/DistanceTravelled.R | 0 .../barnett/library}/ExtractFlights.R | 0 .../barnett/library}/ExtractTimePeriod.R | 0 .../barnett/library}/GPS2MobMat.R | 0 .../barnett/library}/GPSmobility-internal.R | 0 .../barnett/library}/GetMobilityFeaturesMat.R | 0 .../barnett/library}/GuessPause.R | 0 .../barnett/library}/Hometime.R | 0 .../barnett/library}/InitializeParams.R | 0 .../barnett/library}/IsFlight.R | 0 .../barnett/library}/LatLong2XY.R | 0 .../barnett/library}/LocationAt.R | 0 .../barnett/library}/MaxDiam.R | 0 .../library}/MaxDistBetweenTrajectories.R | 0 .../barnett/library}/MaxHomeDist.R | 0 .../barnett/library}/MaxRadius.R | 0 .../barnett/library}/MinsMissing.R | 0 .../barnett/library}/MobilityFeatures.R | 0 .../barnett/library}/MobmatQualityOK.R | 0 .../barnett/library}/ProbPause.R | 0 .../barnett/library}/ProgressBar.R | 0 .../barnett/library}/RadiusOfGyration.R | 0 .../barnett/library}/RandomBridge.R | 0 .../barnett/library}/SigLocEntropy.R | 0 .../barnett/library}/SigLocs.R | 0 .../barnett/library}/SigLocsVisited.R | 0 .../barnett/library}/SimulateMobilityGaps.R | 0 .../barnett/library}/StdFlightDur.R | 0 .../barnett/library}/StdFlightLen.R | 0 .../library}/WriteSurveyAnswers2File.R | 0 .../barnett/library}/plot.flights.R | 0 .../barnett/library}/plotlimits.R | 0 src/features/location/barnett/main.R | 96 ++++++++++++++ .../doryab/main.py} | 119 ++++++++++-------- src/features/location/locations_entry.R | 44 +++++++ src/features/location/locations_entry.py | 39 ++++++ src/features/location_barnett_features.R | 89 ------------- src/features/location_doryab_features.py | 24 ---- src/features/utils/utils.R | 12 ++ src/features/utils/utils.py | 9 ++ 57 files changed, 547 insertions(+), 440 deletions(-) create mode 100644 src/data/assign_to_day_segment.R create mode 100644 src/data/process_location_types.R delete mode 100644 src/data/resample_fused_location.R create mode 100644 src/features/join_features_from_providers.R rename src/features/{location_barnett => location/barnett/library}/AvgFlightDur.R (100%) rename src/features/{location_barnett => location/barnett/library}/AvgFlightLen.R (100%) rename src/features/{location_barnett => location/barnett/library}/Collapse2Pause.R (100%) rename src/features/{location_barnett => location/barnett/library}/DailyMobilityPlots.R (100%) rename src/features/{location_barnett => location/barnett/library}/DailyRoutineIndex.R (100%) rename src/features/{location_barnett => location/barnett/library}/DayDist.R (100%) rename src/features/{location_barnett => location/barnett/library}/DistanceTravelled.R (100%) rename src/features/{location_barnett => 
location/barnett/library}/ExtractFlights.R (100%) rename src/features/{location_barnett => location/barnett/library}/ExtractTimePeriod.R (100%) rename src/features/{location_barnett => location/barnett/library}/GPS2MobMat.R (100%) rename src/features/{location_barnett => location/barnett/library}/GPSmobility-internal.R (100%) rename src/features/{location_barnett => location/barnett/library}/GetMobilityFeaturesMat.R (100%) rename src/features/{location_barnett => location/barnett/library}/GuessPause.R (100%) rename src/features/{location_barnett => location/barnett/library}/Hometime.R (100%) rename src/features/{location_barnett => location/barnett/library}/InitializeParams.R (100%) rename src/features/{location_barnett => location/barnett/library}/IsFlight.R (100%) rename src/features/{location_barnett => location/barnett/library}/LatLong2XY.R (100%) rename src/features/{location_barnett => location/barnett/library}/LocationAt.R (100%) rename src/features/{location_barnett => location/barnett/library}/MaxDiam.R (100%) rename src/features/{location_barnett => location/barnett/library}/MaxDistBetweenTrajectories.R (100%) rename src/features/{location_barnett => location/barnett/library}/MaxHomeDist.R (100%) rename src/features/{location_barnett => location/barnett/library}/MaxRadius.R (100%) rename src/features/{location_barnett => location/barnett/library}/MinsMissing.R (100%) rename src/features/{location_barnett => location/barnett/library}/MobilityFeatures.R (100%) rename src/features/{location_barnett => location/barnett/library}/MobmatQualityOK.R (100%) rename src/features/{location_barnett => location/barnett/library}/ProbPause.R (100%) rename src/features/{location_barnett => location/barnett/library}/ProgressBar.R (100%) rename src/features/{location_barnett => location/barnett/library}/RadiusOfGyration.R (100%) rename src/features/{location_barnett => location/barnett/library}/RandomBridge.R (100%) rename src/features/{location_barnett => location/barnett/library}/SigLocEntropy.R (100%) rename src/features/{location_barnett => location/barnett/library}/SigLocs.R (100%) rename src/features/{location_barnett => location/barnett/library}/SigLocsVisited.R (100%) rename src/features/{location_barnett => location/barnett/library}/SimulateMobilityGaps.R (100%) rename src/features/{location_barnett => location/barnett/library}/StdFlightDur.R (100%) rename src/features/{location_barnett => location/barnett/library}/StdFlightLen.R (100%) rename src/features/{location_barnett => location/barnett/library}/WriteSurveyAnswers2File.R (100%) rename src/features/{location_barnett => location/barnett/library}/plot.flights.R (100%) rename src/features/{location_barnett => location/barnett/library}/plotlimits.R (100%) create mode 100644 src/features/location/barnett/main.R rename src/features/{location_doryab/location_base.py => location/doryab/main.py} (69%) create mode 100644 src/features/location/locations_entry.R create mode 100644 src/features/location/locations_entry.py delete mode 100644 src/features/location_barnett_features.R delete mode 100644 src/features/location_doryab_features.py create mode 100644 src/features/utils/utils.R create mode 100644 src/features/utils/utils.py diff --git a/Snakefile b/Snakefile index 2874e7a4..e56c59a9 100644 --- a/Snakefile +++ b/Snakefile @@ -43,17 +43,6 @@ if config["CALLS"]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"])) 
files_to_compute.extend(expand("data/processed/{pid}/calls_{call_type}.csv", pid=config["PIDS"], call_type=config["CALLS"]["TYPES"])) -if config["BARNETT_LOCATION"]["COMPUTE"]: - if config["BARNETT_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED": - if config["BARNETT_LOCATION"]["DB_TABLE"] in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"]: - files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_resampled.csv", pid=config["PIDS"], sensor=config["BARNETT_LOCATION"]["DB_TABLE"])) - else: - raise ValueError("Error: Add your locations table (and as many sensor tables as you have) to [PHONE_VALID_SENSED_BINS][DB_TABLES] in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data) which is used to resample fused location data (RESAMPLED_FUSED)") - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["BARNETT_LOCATION"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["BARNETT_LOCATION"]["DB_TABLE"])) - files_to_compute.extend(expand("data/processed/{pid}/location_barnett_{day_segment}.csv", pid=config["PIDS"], day_segment = config["BARNETT_LOCATION"]["DAY_SEGMENTS"])) - if config["BLUETOOTH"]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["BLUETOOTH"]["DB_TABLE"])) files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["BLUETOOTH"]["DB_TABLE"])) @@ -142,16 +131,19 @@ if config["CONVERSATION"]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=pids, sensor=table)) files_to_compute.extend(expand("data/processed/{pid}/conversation_{day_segment}.csv",pid=config["PIDS"], day_segment = config["CONVERSATION"]["DAY_SEGMENTS"])) -if config["DORYAB_LOCATION"]["COMPUTE"]: - if config["DORYAB_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED": - if config["DORYAB_LOCATION"]["DB_TABLE"] in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"]: - files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_resampled.csv", pid=config["PIDS"], sensor=config["DORYAB_LOCATION"]["DB_TABLE"])) - else: - raise ValueError("Error: Add your locations table (and as many sensor tables as you have) to [PHONE_VALID_SENSED_BINS][DB_TABLES] in config.yaml. 
This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data) which is used to resample fused location data (RESAMPLED_FUSED)") - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["DORYAB_LOCATION"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["DORYAB_LOCATION"]["DB_TABLE"])) - files_to_compute.extend(expand("data/processed/{pid}/location_doryab_{segment}.csv", pid=config["PIDS"], segment = config["DORYAB_LOCATION"]["DAY_SEGMENTS"])) +for provider in config["LOCATIONS"]["PROVIDERS"].keys(): + if config["LOCATIONS"]["PROVIDERS"][provider]["COMPUTE"]: + if config["LOCATIONS"]["LOCATIONS_TO_USE"] == "FUSED_RESAMPLED": + if config["LOCATIONS"]["DB_TABLE"] in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"]: + files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"])) + else: + raise ValueError("Error: Add your locations table (and as many sensor tables as you have) to [PHONE_VALID_SENSED_BINS][DB_TABLES] in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data) which is used to resample fused location data (FUSED_RESAMPLED)") + + files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["LOCATIONS"]["DB_TABLE"])) + files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["LOCATIONS"]["DB_TABLE"])) + files_to_compute.extend(expand("data/raw/{pid}/{sensor}_processed_{locations_to_use}.csv", pid=config["PIDS"], sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"])) + files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["LOCATIONS"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="LOCATIONS".lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="LOCATIONS".lower())) # visualization for data exploration if config["HEATMAP_FEATURES_CORRELATIONS"]["PLOT"]: diff --git a/config.yaml b/config.yaml index 99bd4346..3884b70c 100644 --- a/config.yaml +++ b/config.yaml @@ -67,33 +67,34 @@ APPLICATION_GENRES: UPDATE_CATALOGUE_FILE: false # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE SCRAPE_MISSING_GENRES: false # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE.
If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway -RESAMPLE_FUSED_LOCATION: - CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold - TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row - TIMEZONE: *timezone - -BARNETT_LOCATION: - COMPUTE: False +LOCATIONS: DB_TABLE: locations - DAY_SEGMENTS: [daily] # These features are only available on a daily basis - FEATURES: ["hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","circdnrtn","wkenddayrtn"] - LOCATIONS_TO_USE: ALL # ALL, ALL_EXCEPT_FUSED OR RESAMPLE_FUSED - ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius + LOCATIONS_TO_USE: FUSED_RESAMPLED # ALL, GPS OR FUSED_RESAMPLED + FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold + FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row TIMEZONE: *timezone - MINUTES_DATA_USED: False # Use this for quality control purposes, how many minutes of data (location coordinates gruped by minute) were used to compute features + + PROVIDERS: + DORYAB: + COMPUTE: True + FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy"] + DBSCAN_EPS: 10 # meters + DBSCAN_MINSAMPLES: 5 + THRESHOLD_STATIC : 1 # km/h + MAXIMUM_GAP_ALLOWED: 300 + MINUTES_DATA_USED: True + SAMPLING_FREQUENCY: 0 + SRC_FOLDER: "doryab" + SRC_LANGUAGE: "python" -DORYAB_LOCATION: - COMPUTE: False - DB_TABLE: locations - DAY_SEGMENTS: *day_segments - FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy"] - LOCATIONS_TO_USE: ALL # ALL, ALL_EXCEPT_FUSED OR RESAMPLE_FUSED - DBSCAN_EPS: 10 # meters - DBSCAN_MINSAMPLES: 5 - THRESHOLD_STATIC : 1 # km/h - MAXIMUM_GAP_ALLOWED: 300 - MINUTES_DATA_USED: False - SAMPLING_FREQUENCY: 0 + BARNETT: + COMPUTE: True + FEATURES: ["hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","circdnrtn","wkenddayrtn"] + ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. 
This number means there's a 68% probability the true location is within this radius + TIMEZONE: *timezone + MINUTES_DATA_USED: True # Use this for quality control purposes, how many minutes of data (location coordinates grouped by minute) were used to compute features + SRC_FOLDER: "barnett" + SRC_LANGUAGE: "r" BLUETOOTH: COMPUTE: False diff --git a/docs/features/extracted.rst b/docs/features/extracted.rst index 14f50b59..1e89f46f 100644 --- a/docs/features/extracted.rst +++ b/docs/features/extracted.rst @@ -579,17 +579,18 @@ are computed. See Ian Barnett, Jukka-Pekka Onnela, Inferring mobility measures f See `Location (Barnett’s) Config Code`_ -**Available Epochs (day_segment) :** daily +**Available Day Segments (epochs):** only daily periods of INTERVAL_EVERY_DAY or INTERVAL_FLEXIBLE_DAY (periods that start at 00:00:00 and end at 23:59:59 on the same day) **Available Platforms:** Android and iOS **Snakemake rule chain:** -- Rule ``rules/preprocessing.snakefile/download_dataset`` -- Rule ``rules/preprocessing.snakefile/readable_datetime`` -- Rule ``rules/preprocessing.snakefile/phone_sensed_bins`` -- Rule ``rules/preprocessing.snakefile/resample_fused_location`` (only relevant if setting ``location_to_use`` to ````RESAMPLE_FUSED``. -- Rule ``rules/features.snakefile/location_barnett_features`` +- Rule ``rules/preprocessing.snakefile/download_dataset`` (de-duplication and sorting by timestamp) +- Rule ``rules/preprocessing.snakefile/readable_datetime`` (adds local date and time components and the local day segments) +- Rule ``rules/preprocessing.snakefile/phone_sensed_bins`` (gets the periods of time the phone was sensing data, to resample over them) +- Rule ``rules/preprocessing.snakefile/process_location_types`` (filters GPS data or resamples fused location, and deletes (0,0) coordinates) +- Rule ``rules/features.snakefile/locations_r_features`` (RAPIDS executes ``barnett_location_features`` from ``src/features/location/barnett/main.R``) +- Rule ``rules/features.snakefile/join_features_from_providers`` (joins the location features of all Python and R providers) .. _location-parameters: @@ -598,7 +599,7 @@ See `Location (Barnett’s) Config Code`_ ================= =================== Name Description ================= =================== -location_to_use *Read the Observations section below*. The specifies what type of location data will be use in the analysis. Possible options are ``ALL``, ``ALL_EXCEPT_FUSED`` OR ``RESAMPLE_FUSED`` +location_to_use *Read the Observations section below*. This specifies what type of location data will be used in the analysis. Possible options are ``ALL``, ``GPS`` or ``FUSED_RESAMPLED`` accuracy_limit This is in meters. The sensor drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius specified. timezone The timezone used to calculate location. minutes_data_used This is NOT a feature. This is just a quality control check, and if set to TRUE, a new column is added to the output file with the number of minutes containing location data that were used to compute all features. The more data minutes exist for a period, the more reliable its features should be. For fused location, a single minute can contain more than one coordinate pair if the participant is moving fast enough.
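+
+For reference, these parameters now live under ``LOCATIONS > PROVIDERS > BARNETT`` in ``config.yaml``. An excerpt of the defaults introduced by this change::
+
+    LOCATIONS:
+        DB_TABLE: locations
+        LOCATIONS_TO_USE: FUSED_RESAMPLED # ALL, GPS OR FUSED_RESAMPLED
+        PROVIDERS:
+            BARNETT:
+                COMPUTE: True
+                ACCURACY_LIMIT: 51 # meters
+                TIMEZONE: *timezone
+                MINUTES_DATA_USED: True
+                SRC_FOLDER: "barnett"
+                SRC_LANGUAGE: "r"
+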
@@ -634,15 +635,15 @@ wkenddayrtn Same as circdnrtn but computed separately for w *Types of location data to use* -Aware Android and iOS clients can collect location coordinates through the phone's GPS or Google's fused location API. If your Aware client was ONLY configured to use GPS set ``location_to_use`` to ``ALL``, if your client was configured to use BOTH GPS and fused location you can use ``ALL`` or set ``location_to_use`` to ``ALL_EXCEPT_FUSED`` to ignore fused coordinates, if your client was configured to use fused location only, set ``location_to_use`` to ``RESAMPLE_FUSED``. ``RESAMPLE_FUSED`` takes the original fused location coordinates and replicates each pair forward in time as long as the phone was sensing data as indicated by ``phone_sensed_bins`` (see :ref:`Phone valid sensed days `), this is done because Google's API only logs a new location coordinate pair when it is sufficiently different from the previous one. +Aware Android and iOS clients can collect location coordinates through the phone's GPS, the cellular network towers around the phone, or Google's fused location API. If you want to use only the GPS provider, set ``location_to_use`` to ``GPS``. If you want to use all providers (not recommended due to the difference in accuracy), set it to ``ALL``. If your Aware client was configured to use fused location only, or you want to focus on this provider alone, set ``location_to_use`` to ``FUSED_RESAMPLED``. ``FUSED_RESAMPLED`` takes the original fused location coordinates and replicates each pair forward in time as long as the phone was sensing data, as indicated by ``phone_sensed_bins`` (see :ref:`Phone valid sensed days `); this is done because Google's API only logs a new location coordinate pair when it is sufficiently different in time or space from the previous one. -There are two parameters associated with resampling fused location in the ``RESAMPLE_FUSED_LOCATION`` section of the ``config.yaml`` file. ``CONSECUTIVE_THRESHOLD`` (in minutes, default 30) controls the maximum gap between any two coordinate pairs to replicate the last known pair (for example, participant A's phone did not collect data between 10.30am and 10:50am and between 11:05am and 11:40am, the last known coordinate pair will be replicated during the first period but not the second, in other words, we assume that we cannot longer guarantee the participant stayed at the last known location if the phone did not sense data for more than 30 minutes). ``TIME_SINCE_VALID_LOCATION`` (in minutes, default 720 or 12 hours) the last known fused location won't be carried over longer that this threshold even if the phone was sensing data continuously (for example, participant A went home at 9pm and their phone was sensing data without gaps until 11am the next morning, the last known location will only be replicated until 9am). If you have suggestions to modify or improve this imputation, let us know. +There are two parameters associated with resampling fused location in the ``LOCATIONS`` section of the ``config.yaml`` file.
``FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD`` (in minutes, default 30) controls the maximum gap between any two coordinate pairs to replicate the last known pair (for example, if participant A's phone did not collect data between 10:30am and 10:50am and between 11:05am and 11:40am, the last known coordinate pair will be replicated during the first gap but not the second; in other words, we assume we can no longer guarantee the participant stayed at the last known location if the phone did not sense data for more than 30 minutes). ``FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION`` (in minutes, default 720 or 12 hours) ensures the last known fused location won't be carried over longer than this threshold even if the phone was sensing data continuously (for example, if participant A went home at 9pm and their phone was sensing data without gaps until 11am the next morning, the last known location will only be replicated until 9am). If you have suggestions to modify or improve this imputation, let us know. *Barnett et al.'s features* These features are based on a Pause-Flight model. A pause is defined as a mobility trace (location pings) within a certain duration and distance (by default 300 seconds and 60 meters). A flight is any mobility trace between two pauses. Data is resampled and imputed before the features are computed. See this paper for more information: https://doi.org/10.1093/biostatistics/kxy059. -In RAPIDS we only expose two parameters for these features (timezone and accuracy). If you wish to change others you can do so in ``src/features/location_barnett/MobilityFeatures.R`` +In RAPIDS we only expose two parameters for these features (timezone and accuracy). If you wish to change others you can do so in ``src/features/location/barnett/library/MobilityFeatures.R`` *Significant Locations* @@ -660,17 +661,18 @@ Doryab's location features are based on this paper: Doryab, A., Chikarsel, P., L See `Location (Doryab's) Config Code`_ -**Available Epochs (day_segment) :** daily, morning, afternoon, evening, night +**Available Day Segments (epochs):** any of FREQUENCY_EVERY_DAY, INTERVAL_EVERY_DAY and INTERVAL_FLEXIBLE_DAY **Available Platforms:** Android and iOS **Snakemake rule chain:** -- Rule ``rules/preprocessing.snakefile/download_dataset`` -- Rule ``rules/preprocessing.snakefile/readable_datetime`` -- Rule ``rules/preprocessing.snakefile/phone_sensed_bins`` -- Rule ``rules/preprocessing.snakefile/resample_fused_location`` (only relevant if setting ``location_to_use`` to ````RESAMPLE_FUSED``. -- Rule ``rules/features.snakefile/location_doryab_features`` +- Rule ``rules/preprocessing.snakefile/download_dataset`` (de-duplication and sorting by timestamp) +- Rule ``rules/preprocessing.snakefile/readable_datetime`` (adds local date and time components and the local day segments) +- Rule ``rules/preprocessing.snakefile/phone_sensed_bins`` (gets the periods of time the phone was sensing data, to resample over them) +- Rule ``rules/preprocessing.snakefile/process_location_types`` (filters GPS data or resamples fused location, and deletes (0,0) coordinates) +- Rule ``rules/features.snakefile/locations_python_features`` (RAPIDS executes ``doryab_location_features`` from ``src/features/location/doryab/main.py``) +- Rule ``rules/features.snakefile/join_features_from_providers`` (joins the location features of all Python and R providers)
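+
+For example, for participant ``p01`` with both providers enabled, this chain writes ``data/interim/p01/locations_features/locations_python_DORYAB.csv`` and ``data/interim/p01/locations_features/locations_r_BARNETT.csv``, which ``join_features_from_providers`` then merges into the final ``data/processed/features/p01/locations.csv`` (paths follow the patterns in ``rules/features.smk``; ``p01`` is an illustrative participant id).
+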
.. _location-doryab-parameters: @@ -680,7 +682,7 @@ See `Location (Doryab's) Config Code`_ Name Description =================== =================== day_segment The particular ``day_segment`` that will be analyzed. The available options are ``daily``, ``morning``, ``afternoon``, ``evening``, ``night`` -location_to_use *Read the Observations section below*. The specifies what type of location data will be use in the analysis. Possible options are ``ALL``, ``ALL_EXCEPT_FUSED`` OR ``RESAMPLE_FUSED``. +location_to_use *Read the Observations section below*. This specifies what type of location data will be used in the analysis. Possible options are ``ALL``, ``GPS`` or ``FUSED_RESAMPLED``. features Features to be computed, see table below. threshold_static It is the threshold value in km/hr which labels a row as Static or Moving. dbscan_minsamples The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. @@ -723,9 +725,9 @@ normalizedlocationentropy nats Shannon Entropy computed *Types of location data to use* -Aware Android and iOS clients can collect location coordinates through the phone's GPS or Google's fused location API. If your Aware client was ONLY configured to use GPS set ``location_to_use`` to ``ALL``, if your client was configured to use BOTH GPS and fused location you can use ``ALL`` or set ``location_to_use`` to ``ALL_EXCEPT_FUSED`` to ignore fused coordinates, if your client was configured to use fused location only, set ``location_to_use`` to ``RESAMPLE_FUSED``. ``RESAMPLE_FUSED`` takes the original fused location coordinates and replicates each pair forward in time as long as the phone was sensing data as indicated by ``phone_sensed_bins`` (see :ref:`Phone valid sensed days `), this is done because Google's API only logs a new location coordinate pair when it is sufficiently different from the previous one. +Aware Android and iOS clients can collect location coordinates through the phone's GPS, the cellular network towers around the phone, or Google's fused location API. If you want to use only the GPS provider, set ``location_to_use`` to ``GPS``. If you want to use all providers (not recommended due to the difference in accuracy), set it to ``ALL``. If your Aware client was configured to use fused location only, or you want to focus on this provider alone, set ``location_to_use`` to ``FUSED_RESAMPLED``. ``FUSED_RESAMPLED`` takes the original fused location coordinates and replicates each pair forward in time as long as the phone was sensing data, as indicated by ``phone_sensed_bins`` (see :ref:`Phone valid sensed days `); this is done because Google's API only logs a new location coordinate pair when it is sufficiently different in time or space from the previous one. -There are two parameters associated with resampling fused location in the ``RESAMPLE_FUSED_LOCATION`` section of the ``config.yaml`` file. ``CONSECUTIVE_THRESHOLD`` (in minutes, default 30) controls the maximum gap between any two coordinate pairs to replicate the last known pair (for example, participant A's phone did not collect data between 10.30am and 10:50am and between 11:05am and 11:40am, the last known coordinate pair will be replicated during the first period but not the second, in other words, we assume that we cannot longer guarantee the participant stayed at the last known location if the phone did not sense data for more than 30 minutes).
``TIME_SINCE_VALID_LOCATION`` (in minutes, default 720 or 12 hours) the last known fused location won't be carried over longer that this threshold even if the phone was sensing data continuously (for example, participant A went home at 9pm and their phone was sensing data without gaps until 11am the next morning, the last known location will only be replicated until 9am). If you have suggestions to modify or improve this imputation, let us know. +There are two parameters associated with resampling fused location in the ``LOCATIONS`` section of the ``config.yaml`` file. ``FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD`` (in minutes, default 30) controls the maximum gap between any two coordinate pairs to replicate the last known pair (for example, if participant A's phone did not collect data between 10:30am and 10:50am and between 11:05am and 11:40am, the last known coordinate pair will be replicated during the first gap but not the second; in other words, we assume we can no longer guarantee the participant stayed at the last known location if the phone did not sense data for more than 30 minutes). ``FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION`` (in minutes, default 720 or 12 hours) ensures the last known fused location won't be carried over longer than this threshold even if the phone was sensing data continuously (for example, if participant A went home at 9pm and their phone was sensing data without gaps until 11am the next morning, the last known location will only be replicated until 9am). If you have suggestions to modify or improve this imputation, let us know. *Significant Locations Identified* diff --git a/rules/common.smk b/rules/common.smk index 68747403..5f7ceed0 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -26,23 +26,13 @@ def optional_phone_sensed_bins_input(wildcards): return expand("data/raw/{{pid}}/{table}_with_datetime.csv", table = tables_platform) -def find_day_segments_input_file(wildcards): - for key, values in config.items(): - if "DB_TABLE" in config[key] and config[key]["DB_TABLE"] == wildcards.sensor: - if "DAY_SEGMENTS" in config[key]: - return config[key]["DAY_SEGMENTS"]["FILE"] - else: - raise ValueError("{} should have a [DAY_SEGMENTS][FILE] parameter containing the path to its day segments file".format(wildcards.sensor)) - -def find_day_segments_input_type(wildcards): - for key, values in config.items(): - if "DB_TABLE" in config[key] and config[key]["DB_TABLE"] == wildcards.sensor: - if "DAY_SEGMENTS" in config[key]: - return config[key]["DAY_SEGMENTS"]["TYPE"] - else: - raise ValueError("{} should have a [DAY_SEGMENTS][TYPE] parameter containing INTERVAL, FREQUENCY, or EVENT".format(wildcards.sensor)) - # Features.smk ######################################################################################################### +def find_features_files(wildcards): + feature_files = [] + for provider_key, provider in config[(wildcards.sensor_key).upper()]["PROVIDERS"].items(): + if provider["COMPUTE"]: + feature_files.extend(expand("data/interim/{{pid}}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", sensor_key=(wildcards.sensor_key).lower(), language=provider["SRC_LANGUAGE"].lower(), provider_key=provider_key)) + return feature_files def optional_ar_input(wildcards): platform = infer_participant_platform("data/external/"+wildcards.pid) @@ -62,18 +52,6 @@ def optional_conversation_input(wildcards): elif platform == "ios": return ["data/raw/{pid}/" + config["CONVERSATION"]["DB_TABLE"]["IOS"] + "_with_datetime_unified.csv"] -def
optional_location_barnett_input(wildcards): - if config["BARNETT_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED": - return expand("data/raw/{{pid}}/{sensor}_resampled.csv", sensor=config["BARNETT_LOCATION"]["DB_TABLE"]) - else: - return expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["BARNETT_LOCATION"]["DB_TABLE"]) - -def optional_location_doryab_input(wildcards): - if config["DORYAB_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED": - return expand("data/raw/{{pid}}/{sensor}_resampled.csv", sensor=config["DORYAB_LOCATION"]["DB_TABLE"]) - else: - return expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["DORYAB_LOCATION"]["DB_TABLE"]) - def optional_steps_sleep_input(wildcards): if config["STEP"]["EXCLUDE_SLEEP"]["EXCLUDE"] == True and config["STEP"]["EXCLUDE_SLEEP"]["TYPE"] == "FITBIT_BASED": return "data/raw/{pid}/fitbit_sleep_summary_with_datetime.csv" diff --git a/rules/features.smk b/rules/features.smk index a4302710..911e7cff 100644 --- a/rules/features.smk +++ b/rules/features.smk @@ -1,3 +1,11 @@ +rule join_features_from_providers: + input: + location_features = find_features_files + output: + "data/processed/features/{pid}/{sensor_key}.csv" + script: + "../src/features/join_features_from_providers.R" + rule messages_features: input: expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["MESSAGES"]["DB_TABLE"]), @@ -54,37 +62,29 @@ rule ios_activity_recognition_deltas: script: "../src/features/activity_recognition_deltas.R" -rule location_barnett_features: +rule locations_python_features: input: - locations = optional_location_barnett_input + location_data = expand("data/raw/{{pid}}/{sensor}_processed_{locations_to_use}.csv", sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"]), + day_segments_labels = "data/interim/day_segments_labels.csv" params: - features = config["BARNETT_LOCATION"]["FEATURES"], - locations_to_use = config["BARNETT_LOCATION"]["LOCATIONS_TO_USE"], - accuracy_limit = config["BARNETT_LOCATION"]["ACCURACY_LIMIT"], - timezone = config["BARNETT_LOCATION"]["TIMEZONE"], - minutes_data_used = config["BARNETT_LOCATION"]["MINUTES_DATA_USED"], - day_segment = "{day_segment}" + provider = lambda wildcards: config["LOCATIONS"]["PROVIDERS"][wildcards.provider_key], + provider_key = "{provider_key}", output: - "data/processed/{pid}/location_barnett_{day_segment}.csv" + "data/interim/{pid}/locations_features/locations_python_{provider_key}.csv" script: - "../src/features/location_barnett_features.R" + "../src/features/location/locations_entry.py" -rule location_doryab_features: +rule locations_r_features: input: - locations = optional_location_doryab_input + location_data = expand("data/raw/{{pid}}/{sensor}_processed_{locations_to_use}.csv", sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"]), + day_segments_labels = "data/interim/day_segments_labels.csv" params: - features = config["DORYAB_LOCATION"]["FEATURES"], - day_segment = "{day_segment}", - dbscan_eps = config["DORYAB_LOCATION"]["DBSCAN_EPS"], - dbscan_minsamples = config["DORYAB_LOCATION"]["DBSCAN_MINSAMPLES"], - threshold_static = config["DORYAB_LOCATION"]["THRESHOLD_STATIC"], - maximum_gap_allowed = config["DORYAB_LOCATION"]["MAXIMUM_GAP_ALLOWED"], - minutes_data_used = config["DORYAB_LOCATION"]["MINUTES_DATA_USED"], - sampling_frequency = config["DORYAB_LOCATION"]["SAMPLING_FREQUENCY"] + provider = lambda wildcards: 
config["LOCATIONS"]["PROVIDERS"][wildcards.provider_key], + provider_key = "{provider_key}" output: - "data/processed/{pid}/location_doryab_{day_segment}.csv" + "data/interim/{pid}/locations_features/locations_r_{provider_key}.csv" script: - "../src/features/location_doryab_features.py" + "../src/features/location/locations_entry.R" rule bluetooth_features: input: diff --git a/rules/preprocessing.smk b/rules/preprocessing.smk index 4d8e9439..c8f98132 100644 --- a/rules/preprocessing.smk +++ b/rules/preprocessing.smk @@ -40,17 +40,17 @@ rule download_dataset: rule compute_day_segments: input: - find_day_segments_input_file + config["DAY_SEGMENTS"]["FILE"] params: - day_segments_type = find_day_segments_input_type + day_segments_type = config["DAY_SEGMENTS"]["TYPE"] output: - segments_file = "data/interim/{sensor}_day_segments.csv", - segments_labels_file = "data/interim/{sensor}_day_segments_labels.csv", + segments_file = "data/interim/day_segments.csv", + segments_labels_file = "data/interim/day_segments_labels.csv", script: "../src/data/compute_day_segments.py" PHONE_SENSORS = [] -PHONE_SENSORS.extend([config["MESSAGES"]["DB_TABLE"], config["CALLS"]["DB_TABLE"], config["BARNETT_LOCATION"]["DB_TABLE"], config["DORYAB_LOCATION"]["DB_TABLE"], config["BLUETOOTH"]["DB_TABLE"], config["BATTERY"]["DB_TABLE"], config["SCREEN"]["DB_TABLE"], config["LIGHT"]["DB_TABLE"], config["ACCELEROMETER"]["DB_TABLE"], config["APPLICATIONS_FOREGROUND"]["DB_TABLE"], config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["CONVERSATION"]["DB_TABLE"]["IOS"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]]) +PHONE_SENSORS.extend([config["MESSAGES"]["DB_TABLE"], config["CALLS"]["DB_TABLE"], config["LOCATIONS"]["DB_TABLE"], config["BLUETOOTH"]["DB_TABLE"], config["BATTERY"]["DB_TABLE"], config["SCREEN"]["DB_TABLE"], config["LIGHT"]["DB_TABLE"], config["ACCELEROMETER"]["DB_TABLE"], config["APPLICATIONS_FOREGROUND"]["DB_TABLE"], config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["CONVERSATION"]["DB_TABLE"]["IOS"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]]) PHONE_SENSORS.extend(config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"]) if len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) > 0: @@ -62,11 +62,11 @@ if len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) > 0: rule readable_datetime: input: sensor_input = "data/raw/{pid}/{sensor}_raw.csv", - day_segments = "data/interim/{sensor}_day_segments.csv" + day_segments = "data/interim/day_segments.csv" params: timezones = None, fixed_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"], - day_segments_type = find_day_segments_input_type + day_segments_type = config["DAY_SEGMENTS"]["TYPE"] wildcard_constraints: sensor = '.*(' + '|'.join([re.escape(x) for x in PHONE_SENSORS]) + ').*' # only process smartphone sensors, not fitbit output: @@ -108,19 +108,22 @@ rule unify_ios_android: script: "../src/data/unify_ios_android.R" -rule resample_fused_location: +rule process_location_types: input: - locations = "data/raw/{pid}/{sensor}_raw.csv", - phone_sensed_bins = rules.phone_sensed_bins.output + locations = "data/raw/{pid}/{sensor}_with_datetime.csv", + phone_sensed_bins = rules.phone_sensed_bins.output, + day_segments = "data/interim/day_segments.csv" params: bin_size = config["PHONE_VALID_SENSED_BINS"]["BIN_SIZE"], - timezone = config["RESAMPLE_FUSED_LOCATION"]["TIMEZONE"], - consecutive_threshold = 
config["RESAMPLE_FUSED_LOCATION"]["CONSECUTIVE_THRESHOLD"], - time_since_valid_location = config["RESAMPLE_FUSED_LOCATION"]["TIME_SINCE_VALID_LOCATION"] + timezone = config["LOCATIONS"]["TIMEZONE"], + consecutive_threshold = config["LOCATIONS"]["FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD"], + time_since_valid_location = config["LOCATIONS"]["FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION"], + day_segments_type = config["DAY_SEGMENTS"]["TYPE"], + locations_to_use = "{locations_to_used}" output: - "data/raw/{pid}/{sensor}_resampled.csv" + "data/raw/{pid}/{sensor}_processed_{locations_to_used}.csv" script: - "../src/data/resample_fused_location.R" + "../src/data/process_location_types.R" rule application_genres: input: diff --git a/src/data/assign_to_day_segment.R b/src/data/assign_to_day_segment.R new file mode 100644 index 00000000..6dc3ee4b --- /dev/null +++ b/src/data/assign_to_day_segment.R @@ -0,0 +1,85 @@ +library("tidyverse") +library("lubridate") + +assign_to_day_segment <- function(data, day_segments, day_segments_type, fixed_timezone){ + + if(day_segments_type == "FREQUENCY_EVERY_DAY"){ + data <- data %>% mutate(local_date_time_obj = lubridate::parse_date_time(local_time, orders = c("HMS", "HM"))) + day_segments <- day_segments %>% mutate(start_time = lubridate::parse_date_time(start_time, orders = c("HMS", "HM")), + end_time = start_time + minutes(length)) + + # Create a new column for each day_segment + for(row_id in 1:nrow(day_segments)){ + row = day_segments[row_id,] + data <- data %>% mutate(!!paste("local_day_segment", row_id, sep = "_") := ifelse(local_date_time_obj >= row$start_time & local_date_time_obj < row$end_time, + paste0("[", + row$label, "_", + local_date, "_", + paste(str_pad(hour(row$start_time),2, pad="0"), str_pad(minute(row$start_time),2, pad="0"), str_pad(second(row$start_time),2, pad="0"),sep =":"), + "]"), NA)) + } + + } else if (day_segments_type == "INTERVAL_EVERY_DAY"){ + + data_dates <- data %>% select(local_date) %>% distinct(local_date) + inferred_day_segments <- crossing(day_segments, data_dates) %>% + mutate(start_local_date_time_obj = lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM"), tz = fixed_timezone), + end_local_date_time_obj = start_local_date_time_obj + lubridate::period(length), + date_time_interval = lubridate::interval(start_local_date_time_obj, end_local_date_time_obj)) %>% + group_by(label, local_date) %>% + mutate(group_start_datetime = lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM"), tz = fixed_timezone), + group_end_datetime = group_start_datetime + lubridate::period(length), + group_start_datetime = min(group_start_datetime), + group_end_datetime = max(group_end_datetime)) %>% + ungroup() + + + data <- data %>% mutate(local_date_time_obj = lubridate::ymd_hms(local_date_time, tz = fixed_timezone)) + + # Create a new column for each day_segment + for(row_id in 1:nrow(inferred_day_segments)){ + row = inferred_day_segments[row_id,] + data <- data %>% mutate(!!paste("local_day_segment", row_id, sep = "_") := ifelse(local_date_time_obj %within% row$date_time_interval, + paste0("[", + paste(sep= "#", + row$label, + lubridate::date(row$group_start_datetime), + paste(str_pad(hour(row$group_start_datetime),2, pad="0"), str_pad(minute(row$group_start_datetime),2, pad="0"), str_pad(second(row$group_start_datetime),2, pad="0"),sep =":"), + lubridate::date(row$group_end_datetime), + paste(str_pad(hour(row$group_end_datetime),2, pad="0"), 
str_pad(minute(row$group_end_datetime),2, pad="0"), str_pad(second(row$group_end_datetime),2, pad="0"),sep =":") + ), + "]"), NA)) + } + + + } else if ( day_segments_type == "INTERVAL_FLEXIBLE_DAY"){ + data <- data %>% mutate(local_date_time_obj = lubridate::ymd_hms(local_date_time, tz = fixed_timezone)) + day_segments <- day_segments %>% mutate(shift = ifelse(shift == "0", "0seconds", shift), + start_local_date_time_obj = lubridate::ymd_hms(start_date_time, tz = fixed_timezone) + (lubridate::period(shift) * ifelse(shift_direction >= 0, 1, -1)), + end_local_date_time_obj = start_local_date_time_obj + lubridate::period(length), + date_time_interval = lubridate::interval(start_local_date_time_obj, end_local_date_time_obj)) + + # Create a new column for each day_segment + for(row_id in 1:nrow(day_segments)){ + row = day_segments[row_id,] + data <- data %>% mutate(!!paste("local_day_segment", row_id, sep = "_") := ifelse(local_date_time_obj %within% row$date_time_interval, + paste0("[", + paste(sep= "#", + row$label, + lubridate::date(row$start_local_date_time_obj), + paste(str_pad(hour(row$start_local_date_time_obj),2, pad="0"), str_pad(minute(row$start_local_date_time_obj),2, pad="0"), str_pad(second(row$start_local_date_time_obj),2, pad="0"),sep =":"), + lubridate::date(row$end_local_date_time_obj), + paste(str_pad(hour(row$end_local_date_time_obj),2, pad="0"), str_pad(minute(row$end_local_date_time_obj),2, pad="0"), str_pad(second(row$end_local_date_time_obj),2, pad="0"),sep =":") + ), + "]"), NA)) + } + } + + # Join all day_segments in a single column + data <- data %>% + unite("assigned_segments", starts_with("local_day_segment"), sep = "|", na.rm = TRUE) %>% + select(-local_date_time_obj) + + return(data) +} \ No newline at end of file diff --git a/src/data/process_location_types.R b/src/data/process_location_types.R new file mode 100644 index 00000000..2165be51 --- /dev/null +++ b/src/data/process_location_types.R @@ -0,0 +1,76 @@ +source("renv/activate.R") +library(dplyr) +library(readr) +library(tidyr) + +source("src/data/assign_to_day_segment.R") + +bin_size <- snakemake@params[["bin_size"]] +timezone <- snakemake@params[["timezone"]] +consecutive_threshold <- snakemake@params[["consecutive_threshold"]] +time_since_valid_location <- snakemake@params[["time_since_valid_location"]] +locations_to_use <- snakemake@params[["locations_to_use"]] +day_segments <- read.csv(snakemake@input[["day_segments"]]) +day_segments_type <- snakemake@params[["day_segments_type"]] + +phone_sensed_bins <- read_csv(snakemake@input[["phone_sensed_bins"]], col_types = cols(local_date = col_character())) +locations <- read_csv(snakemake@input[["locations"]], col_types = cols()) %>% + filter(double_latitude != 0 & double_longitude != 0) + +if(!locations_to_use %in% c("ALL", "FUSED_RESAMPLED", "GPS")){ + print("Unknown location filter, provide one of the following three: ALL, GPS, or FUSED_RESAMPLED") + quit(save = "no", status = 1, runLast = FALSE) + } + + +if(locations_to_use == "ALL"){ + processed_locations <- locations +} else if(locations_to_use == "GPS"){ + processed_locations <- locations %>% filter(provider == "gps") +} else if(locations_to_use == "FUSED_RESAMPLED"){ + locations <- locations %>% filter(provider == "fused") + if(nrow(locations) > 0){ + sensed_minute_bins <- phone_sensed_bins %>% + pivot_longer(-local_date, names_to = c("hour", "bin"), names_sep = "_",
values_to = "sensor_count") %>% + mutate(hour = as.integer(hour), bin = as.integer(bin)) %>% + complete(nesting(local_date, hour), bin = seq(0, 59,1)) %>% + fill(sensor_count) %>% + mutate(timestamp = as.numeric(as.POSIXct(paste0(local_date, " ", hour,":", bin,":00"), format = "%Y-%m-%d %H:%M:%S", tz = timezone)) * 1000 ) %>% + filter(sensor_count > 0) %>% + select(timestamp) + + resampled_locations <- locations %>% + select(-assigned_segments) %>% + bind_rows(sensed_minute_bins) %>% + mutate(provider = replace_na(provider, "resampled")) %>% + arrange(timestamp) %>% + # We group and therefore, fill in, missing rows that appear after a valid fused location record and exist + # within consecutive_threshold minutes from each other + mutate(consecutive_time_diff = c(1, diff(timestamp)), + resample_group = cumsum(!is.na(double_longitude) | consecutive_time_diff > (1000 * 60 * consecutive_threshold))) %>% + group_by(resample_group) %>% + # drop rows that are logged after time_since_valid_location minutes from the last valid fused location + filter((timestamp - first(timestamp) < (1000 * 60 * time_since_valid_location))) %>% + fill(-timestamp, -resample_group) %>% + select(-consecutive_time_diff) %>% + drop_na(double_longitude, double_latitude, accuracy) %>% + # Add local date_time + mutate(utc_date_time = as.POSIXct(timestamp/1000, origin="1970-01-01", tz="UTC"), + local_date_time = format(utc_date_time, tz = timezone, usetz = F)) %>% + separate(local_date_time, c("local_date","local_time"), "\\s", remove = FALSE) %>% + separate(local_time, c("local_hour", "local_minute"), ":", remove = FALSE, extra = "drop") %>% + mutate(local_hour = as.numeric(local_hour), + local_minute = as.numeric(local_minute)) %>% + # Delete resampled rows that exist in the same minute as other original (fused) rows + group_by(local_date, local_hour, local_minute) %>% + mutate(n = n()) %>% + filter(n == 1 | (n > 1 & provider == "fused")) %>% + select(-n) %>% + ungroup() + processed_locations <- assign_to_day_segment(resampled_locations, day_segments, day_segments_type, timezone) + } else { + processed_locations <- locations + } +} +write.csv(processed_locations,snakemake@output[[1]], row.names = F) diff --git a/src/data/readable_datetime.R b/src/data/readable_datetime.R index b220df74..7751fcb3 100644 --- a/src/data/readable_datetime.R +++ b/src/data/readable_datetime.R @@ -1,8 +1,8 @@ source("renv/activate.R") - library("tidyverse") library("readr") -library("lubridate") + +source("src/data/assign_to_day_segment.R") input <- read.csv(snakemake@input[["sensor_input"]]) %>% arrange(timestamp) day_segments <- read.csv(snakemake@input[["day_segments"]]) @@ -11,89 +11,6 @@ sensor_output <- snakemake@output[[1]] timezone_periods <- snakemake@params[["timezone_periods"]] fixed_timezone <- snakemake@params[["fixed_timezone"]] -assign_to_day_segment <- function(data, day_segments, day_segments_type, fixed_timezone){ - - if(day_segments_type == "FREQUENCY_EVERY_DAY"){ - data <- data %>% mutate(local_date_time_obj = lubridate::parse_date_time(local_time, orders = c("HMS", "HM"))) - day_segments <- day_segments %>% mutate(start_time = lubridate::parse_date_time(start_time, orders = c("HMS", "HM")), - end_time = start_time + minutes(length)) - - # Create a new column for each day_segment - for(row_id in 1:nrow(day_segments)){ - row = day_segments[row_id,] - data <- data %>% mutate(!!paste("local_day_segment", row_id, sep = "_") := ifelse(local_date_time_obj >= row$start_time & local_date_time_obj < row$end_time, - paste0("[", - 
row$label, "_", - local_date, "_", - paste(str_pad(hour(row$start_time),2, pad="0"), str_pad(minute(row$start_time),2, pad="0"), str_pad(second(row$start_time),2, pad="0"),sep =":"), - "]"), NA)) - } - - } else if (day_segments_type == "INTERVAL_EVERY_DAY"){ - - data_dates <- data %>% select(local_date) %>% distinct(local_date) - inferred_day_segments <- crossing(day_segments, data_dates) %>% - mutate(start_local_date_time_obj = lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM"), tz = fixed_timezone), - end_local_date_time_obj = start_local_date_time_obj + lubridate::period(length), - date_time_interval = lubridate::interval(start_local_date_time_obj, end_local_date_time_obj)) %>% - group_by(label, local_date) %>% - mutate(group_start_datetime = lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM"), tz = fixed_timezone), - group_end_datetime = group_start_datetime + lubridate::period(length), - group_start_datetime = min(group_start_datetime), - group_end_datetime = max(group_end_datetime)) %>% - ungroup() - - - data <- data %>% mutate(local_date_time_obj = lubridate::ymd_hms(local_date_time, tz = fixed_timezone)) - - # Create a new column for each day_segment - for(row_id in 1:nrow(inferred_day_segments)){ - row = inferred_day_segments[row_id,] - data <- data %>% mutate(!!paste("local_day_segment", row_id, sep = "_") := ifelse(local_date_time_obj %within% row$date_time_interval, - paste0("[", - paste(sep= "#", - row$label, - lubridate::date(row$group_start_datetime), - paste(str_pad(hour(row$group_start_datetime),2, pad="0"), str_pad(minute(row$group_start_datetime),2, pad="0"), str_pad(second(row$group_start_datetime),2, pad="0"),sep =":"), - lubridate::date(row$group_end_datetime), - paste(str_pad(hour(row$group_end_datetime),2, pad="0"), str_pad(minute(row$group_end_datetime),2, pad="0"), str_pad(second(row$group_end_datetime),2, pad="0"),sep =":") - ), - "]"), NA)) - } - - - } else if ( day_segments_type == "INTERVAL_FLEXIBLE_DAY"){ - data <- data %>% mutate(local_date_time_obj = lubridate::ymd_hms(local_date_time, tz = fixed_timezone)) - day_segments <- day_segments %>% mutate(shift = ifelse(shift == "0", "0seconds", shift), - start_local_date_time_obj = lubridate::ymd_hms(start_date_time, tz = fixed_timezone) + (lubridate::period(shift) * ifelse(shift_direction >= 0, 1, -1)), - end_local_date_time_obj = start_local_date_time_obj + lubridate::period(length), - date_time_interval = lubridate::interval(start_local_date_time_obj, end_local_date_time_obj)) - - # Create a new column for each day_segment - for(row_id in 1:nrow(day_segments)){ - row = day_segments[row_id,] - print(row$length) - data <- data %>% mutate(!!paste("local_day_segment", row_id, sep = "_") := ifelse(local_date_time_obj %within% row$date_time_interval, - paste0("[", - paste(sep= "#", - row$label, - lubridate::date(row$start_local_date_time_obj), - paste(str_pad(hour(row$start_local_date_time_obj),2, pad="0"), str_pad(minute(row$start_local_date_time_obj),2, pad="0"), str_pad(second(row$start_local_date_time_obj),2, pad="0"),sep =":"), - lubridate::date(row$end_local_date_time_obj), - paste(str_pad(hour(row$end_local_date_time_obj),2, pad="0"), str_pad(minute(row$end_local_date_time_obj),2, pad="0"), str_pad(second(row$end_local_date_time_obj),2, pad="0"),sep =":") - ), - "]"), NA)) - } - } - - # Join all day_segments in a single column - data <- data %>% - unite("assigned_segments", starts_with("local_day_segment"), sep = "|", na.rm = TRUE) 
%>% - select(-local_date_time_obj) - - return(data) -} - split_local_date_time <- function(data, day_segments){ split_data <- data %>% separate(local_date_time, c("local_date","local_time"), "\\s", remove = FALSE) %>% diff --git a/src/data/resample_fused_location.R b/src/data/resample_fused_location.R deleted file mode 100644 index ccff8a05..00000000 --- a/src/data/resample_fused_location.R +++ /dev/null @@ -1,59 +0,0 @@ -source("renv/activate.R") - -library(dplyr) -library(readr) -library(tidyr) - -bin_size <- snakemake@params[["bin_size"]] -timezone <- snakemake@params[["timezone"]] -consecutive_threshold <- snakemake@params[["consecutive_threshold"]] -time_since_valid_location <- snakemake@params[["time_since_valid_location"]] - -locations <- read_csv(snakemake@input[["locations"]], col_types = cols()) %>% filter(provider == "fused") -phone_sensed_bins <- read_csv(snakemake@input[["phone_sensed_bins"]], col_types = cols(local_date = col_character())) - -if(nrow(locations) > 0){ - sensed_minute_bins <- phone_sensed_bins %>% - pivot_longer(-local_date, names_to = c("hour", "bin"), names_sep = "_", values_to = "sensor_count") %>% - mutate(hour = as.integer(hour), bin = as.integer(bin)) %>% - complete(nesting(local_date, hour), bin = seq(0, 59,1)) %>% - fill(sensor_count) %>% - mutate(timestamp = as.numeric(as.POSIXct(paste0(local_date, " ", hour,":", bin,":00"), format = "%Y-%m-%d %H:%M:%S", tz = timezone)) * 1000 ) %>% - filter(sensor_count > 0) %>% - select(timestamp) - - resampled_locations <- locations %>% - bind_rows(sensed_minute_bins) %>% - mutate(provider = replace_na(provider, "resampled")) %>% - arrange(timestamp) %>% - # We group and therefore, fill in, missing rows that appear after a valid fused location record and exist - # within consecutive_threshold minutes from each other - mutate(consecutive_time_diff = c(1, diff(timestamp)), - resample_group = cumsum(!is.na(double_longitude) | consecutive_time_diff > (1000 * 60 * consecutive_threshold))) %>% - group_by(resample_group) %>% - # drop rows that are logged after time_since_valid_location minutes from the last valid fused location - filter((timestamp - first(timestamp) < (1000 * 60 * time_since_valid_location))) %>% - fill(-timestamp, -resample_group) %>% - select(-consecutive_time_diff) %>% - drop_na(double_longitude, double_latitude, accuracy) %>% - # Add local date_time - mutate(utc_date_time = as.POSIXct(timestamp/1000, origin="1970-01-01", tz="UTC"), - local_date_time = format(utc_date_time, tz = timezone, usetz = F)) %>% - separate(local_date_time, c("local_date","local_time"), "\\s", remove = FALSE) %>% - separate(local_time, c("local_hour", "local_minute"), ":", remove = FALSE, extra = "drop") %>% - mutate(local_hour = as.numeric(local_hour), - local_minute = as.numeric(local_minute), - local_day_segment = case_when(local_hour %in% 0:5 ~ "night", - local_hour %in% 6:11 ~ "morning", - local_hour %in% 12:17 ~ "afternoon", - local_hour %in% 18:23 ~ "evening")) %>% - # Delete resampled rows that exist in the same minute as other original (fused) rows - group_by(local_date, local_hour, local_minute) %>% - mutate(n = n()) %>% - filter(n == 1 | (n > 1 & provider == "fused")) %>% - select(-n) - - write.csv(resampled_locations,snakemake@output[[1]], row.names = F) -} else { - write.csv(locations,snakemake@output[[1]], row.names = F) -} diff --git a/src/features/join_features_from_providers.R b/src/features/join_features_from_providers.R new file mode 100644 index 00000000..888a77e9 --- /dev/null +++ 
b/src/features/join_features_from_providers.R @@ -0,0 +1,14 @@ +source("renv/activate.R") + +library("tidyr") +library("dplyr") + +location_features_files <- snakemake@input[["location_features"]] +location_features <- setNames(data.frame(matrix(ncol = 1, nrow = 0)), c("local_segment")) + + +for(location_features_file in location_features_files){ + location_features <- merge(location_features, read.csv(location_features_file), all = TRUE) +} + +write.csv(location_features, snakemake@output[[1]], row.names = FALSE) \ No newline at end of file diff --git a/src/features/location_barnett/AvgFlightDur.R b/src/features/location/barnett/library/AvgFlightDur.R similarity index 100% rename from src/features/location_barnett/AvgFlightDur.R rename to src/features/location/barnett/library/AvgFlightDur.R diff --git a/src/features/location_barnett/AvgFlightLen.R b/src/features/location/barnett/library/AvgFlightLen.R similarity index 100% rename from src/features/location_barnett/AvgFlightLen.R rename to src/features/location/barnett/library/AvgFlightLen.R diff --git a/src/features/location_barnett/Collapse2Pause.R b/src/features/location/barnett/library/Collapse2Pause.R similarity index 100% rename from src/features/location_barnett/Collapse2Pause.R rename to src/features/location/barnett/library/Collapse2Pause.R diff --git a/src/features/location_barnett/DailyMobilityPlots.R b/src/features/location/barnett/library/DailyMobilityPlots.R similarity index 100% rename from src/features/location_barnett/DailyMobilityPlots.R rename to src/features/location/barnett/library/DailyMobilityPlots.R diff --git a/src/features/location_barnett/DailyRoutineIndex.R b/src/features/location/barnett/library/DailyRoutineIndex.R similarity index 100% rename from src/features/location_barnett/DailyRoutineIndex.R rename to src/features/location/barnett/library/DailyRoutineIndex.R diff --git a/src/features/location_barnett/DayDist.R b/src/features/location/barnett/library/DayDist.R similarity index 100% rename from src/features/location_barnett/DayDist.R rename to src/features/location/barnett/library/DayDist.R diff --git a/src/features/location_barnett/DistanceTravelled.R b/src/features/location/barnett/library/DistanceTravelled.R similarity index 100% rename from src/features/location_barnett/DistanceTravelled.R rename to src/features/location/barnett/library/DistanceTravelled.R diff --git a/src/features/location_barnett/ExtractFlights.R b/src/features/location/barnett/library/ExtractFlights.R similarity index 100% rename from src/features/location_barnett/ExtractFlights.R rename to src/features/location/barnett/library/ExtractFlights.R diff --git a/src/features/location_barnett/ExtractTimePeriod.R b/src/features/location/barnett/library/ExtractTimePeriod.R similarity index 100% rename from src/features/location_barnett/ExtractTimePeriod.R rename to src/features/location/barnett/library/ExtractTimePeriod.R diff --git a/src/features/location_barnett/GPS2MobMat.R b/src/features/location/barnett/library/GPS2MobMat.R similarity index 100% rename from src/features/location_barnett/GPS2MobMat.R rename to src/features/location/barnett/library/GPS2MobMat.R diff --git a/src/features/location_barnett/GPSmobility-internal.R b/src/features/location/barnett/library/GPSmobility-internal.R similarity index 100% rename from src/features/location_barnett/GPSmobility-internal.R rename to src/features/location/barnett/library/GPSmobility-internal.R diff --git a/src/features/location_barnett/GetMobilityFeaturesMat.R 
b/src/features/location/barnett/library/GetMobilityFeaturesMat.R similarity index 100% rename from src/features/location_barnett/GetMobilityFeaturesMat.R rename to src/features/location/barnett/library/GetMobilityFeaturesMat.R diff --git a/src/features/location_barnett/GuessPause.R b/src/features/location/barnett/library/GuessPause.R similarity index 100% rename from src/features/location_barnett/GuessPause.R rename to src/features/location/barnett/library/GuessPause.R diff --git a/src/features/location_barnett/Hometime.R b/src/features/location/barnett/library/Hometime.R similarity index 100% rename from src/features/location_barnett/Hometime.R rename to src/features/location/barnett/library/Hometime.R diff --git a/src/features/location_barnett/InitializeParams.R b/src/features/location/barnett/library/InitializeParams.R similarity index 100% rename from src/features/location_barnett/InitializeParams.R rename to src/features/location/barnett/library/InitializeParams.R diff --git a/src/features/location_barnett/IsFlight.R b/src/features/location/barnett/library/IsFlight.R similarity index 100% rename from src/features/location_barnett/IsFlight.R rename to src/features/location/barnett/library/IsFlight.R diff --git a/src/features/location_barnett/LatLong2XY.R b/src/features/location/barnett/library/LatLong2XY.R similarity index 100% rename from src/features/location_barnett/LatLong2XY.R rename to src/features/location/barnett/library/LatLong2XY.R diff --git a/src/features/location_barnett/LocationAt.R b/src/features/location/barnett/library/LocationAt.R similarity index 100% rename from src/features/location_barnett/LocationAt.R rename to src/features/location/barnett/library/LocationAt.R diff --git a/src/features/location_barnett/MaxDiam.R b/src/features/location/barnett/library/MaxDiam.R similarity index 100% rename from src/features/location_barnett/MaxDiam.R rename to src/features/location/barnett/library/MaxDiam.R diff --git a/src/features/location_barnett/MaxDistBetweenTrajectories.R b/src/features/location/barnett/library/MaxDistBetweenTrajectories.R similarity index 100% rename from src/features/location_barnett/MaxDistBetweenTrajectories.R rename to src/features/location/barnett/library/MaxDistBetweenTrajectories.R diff --git a/src/features/location_barnett/MaxHomeDist.R b/src/features/location/barnett/library/MaxHomeDist.R similarity index 100% rename from src/features/location_barnett/MaxHomeDist.R rename to src/features/location/barnett/library/MaxHomeDist.R diff --git a/src/features/location_barnett/MaxRadius.R b/src/features/location/barnett/library/MaxRadius.R similarity index 100% rename from src/features/location_barnett/MaxRadius.R rename to src/features/location/barnett/library/MaxRadius.R diff --git a/src/features/location_barnett/MinsMissing.R b/src/features/location/barnett/library/MinsMissing.R similarity index 100% rename from src/features/location_barnett/MinsMissing.R rename to src/features/location/barnett/library/MinsMissing.R diff --git a/src/features/location_barnett/MobilityFeatures.R b/src/features/location/barnett/library/MobilityFeatures.R similarity index 100% rename from src/features/location_barnett/MobilityFeatures.R rename to src/features/location/barnett/library/MobilityFeatures.R diff --git a/src/features/location_barnett/MobmatQualityOK.R b/src/features/location/barnett/library/MobmatQualityOK.R similarity index 100% rename from src/features/location_barnett/MobmatQualityOK.R rename to src/features/location/barnett/library/MobmatQualityOK.R diff 
--git a/src/features/location_barnett/ProbPause.R b/src/features/location/barnett/library/ProbPause.R similarity index 100% rename from src/features/location_barnett/ProbPause.R rename to src/features/location/barnett/library/ProbPause.R diff --git a/src/features/location_barnett/ProgressBar.R b/src/features/location/barnett/library/ProgressBar.R similarity index 100% rename from src/features/location_barnett/ProgressBar.R rename to src/features/location/barnett/library/ProgressBar.R diff --git a/src/features/location_barnett/RadiusOfGyration.R b/src/features/location/barnett/library/RadiusOfGyration.R similarity index 100% rename from src/features/location_barnett/RadiusOfGyration.R rename to src/features/location/barnett/library/RadiusOfGyration.R diff --git a/src/features/location_barnett/RandomBridge.R b/src/features/location/barnett/library/RandomBridge.R similarity index 100% rename from src/features/location_barnett/RandomBridge.R rename to src/features/location/barnett/library/RandomBridge.R diff --git a/src/features/location_barnett/SigLocEntropy.R b/src/features/location/barnett/library/SigLocEntropy.R similarity index 100% rename from src/features/location_barnett/SigLocEntropy.R rename to src/features/location/barnett/library/SigLocEntropy.R diff --git a/src/features/location_barnett/SigLocs.R b/src/features/location/barnett/library/SigLocs.R similarity index 100% rename from src/features/location_barnett/SigLocs.R rename to src/features/location/barnett/library/SigLocs.R diff --git a/src/features/location_barnett/SigLocsVisited.R b/src/features/location/barnett/library/SigLocsVisited.R similarity index 100% rename from src/features/location_barnett/SigLocsVisited.R rename to src/features/location/barnett/library/SigLocsVisited.R diff --git a/src/features/location_barnett/SimulateMobilityGaps.R b/src/features/location/barnett/library/SimulateMobilityGaps.R similarity index 100% rename from src/features/location_barnett/SimulateMobilityGaps.R rename to src/features/location/barnett/library/SimulateMobilityGaps.R diff --git a/src/features/location_barnett/StdFlightDur.R b/src/features/location/barnett/library/StdFlightDur.R similarity index 100% rename from src/features/location_barnett/StdFlightDur.R rename to src/features/location/barnett/library/StdFlightDur.R diff --git a/src/features/location_barnett/StdFlightLen.R b/src/features/location/barnett/library/StdFlightLen.R similarity index 100% rename from src/features/location_barnett/StdFlightLen.R rename to src/features/location/barnett/library/StdFlightLen.R diff --git a/src/features/location_barnett/WriteSurveyAnswers2File.R b/src/features/location/barnett/library/WriteSurveyAnswers2File.R similarity index 100% rename from src/features/location_barnett/WriteSurveyAnswers2File.R rename to src/features/location/barnett/library/WriteSurveyAnswers2File.R diff --git a/src/features/location_barnett/plot.flights.R b/src/features/location/barnett/library/plot.flights.R similarity index 100% rename from src/features/location_barnett/plot.flights.R rename to src/features/location/barnett/library/plot.flights.R diff --git a/src/features/location_barnett/plotlimits.R b/src/features/location/barnett/library/plotlimits.R similarity index 100% rename from src/features/location_barnett/plotlimits.R rename to src/features/location/barnett/library/plotlimits.R diff --git a/src/features/location/barnett/main.R b/src/features/location/barnett/main.R new file mode 100644 index 00000000..54dc9628 --- /dev/null +++ 
b/src/features/location/barnett/main.R @@ -0,0 +1,96 @@ +source("renv/activate.R") +library("dplyr") +library("stringr") + +# Load Ian Barnett's code. Taken from https://scholar.harvard.edu/ibarnett/software/gpsmobility +file.sources = list.files(c("src/features/location/barnett/library"), pattern="*.R$", full.names=TRUE, ignore.case=TRUE) +sapply(file.sources,source,.GlobalEnv) + +create_empty_file <- function(requested_features){ + return(data.frame(local_segment= character(), + locations_barnett_hometime= numeric(), + locations_barnett_disttravelled= numeric(), + locations_barnett_rog= numeric(), + locations_barnett_maxdiam= numeric(), + locations_barnett_maxhomedist= numeric(), + locations_barnett_siglocsvisited= numeric(), + locations_barnett_avgflightlen= numeric(), + locations_barnett_stdflightlen= numeric(), + locations_barnett_avgflightdur= numeric(), + locations_barnett_stdflightdur= numeric(), + locations_barnett_probpause= numeric(), + locations_barnett_siglocentropy= numeric(), + locations_barnett_minsmissing= numeric(), + locations_barnett_circdnrtn= numeric(), + locations_barnett_wkenddayrtn= numeric(), + locations_barnett_minutes_data_used= numeric() + ) %>% select(all_of(requested_features))) +} + +barnett_location_features <- function(location_data, day_segment, params){ + location_features <- NULL + location <- location_data + accuracy_limit <- params[["ACCURACY_LIMIT"]] + timezone <- params[["TIMEZONE"]] + minutes_data_used <- params[["MINUTES_DATA_USED"]] + + # Determine which of the requested features this provider can compute + available_features <- c("hometime","disttravelled","rog","maxdiam", "maxhomedist","siglocsvisited","avgflightlen", "stdflightlen", + "avgflightdur","stdflightdur", "probpause","siglocentropy","minsmissing", "circdnrtn","wkenddayrtn") + requested_features <- intersect(unlist(params["FEATURES"], use.names = F), available_features) + requested_features <- c("local_segment", paste("locations_barnett", requested_features, sep = "_")) + if(minutes_data_used) + requested_features <- c(requested_features, "locations_barnett_minutes_data_used") + + # Exclude datasets with less than 24 hours of data + if(max(location$timestamp) - min(location$timestamp) < 86400000) + location <- head(location, 0) + + if (nrow(location) > 1){ + # Filter by segment, skipping any non-daily segment + location <- location %>% filter_data_by_segment(day_segment) + segment <- location %>% head(1) %>% pull(local_segment) + segment_data <- str_split(segment, "#")[[1]] + if(segment_data[[2]] != segment_data[[4]] || segment_data[[3]] != "00:00:00" || segment_data[[5]] != "23:59:59"){ + warning(paste("Barnett's location features cannot be computed for day segments that are not daily (cover 00:00:00 to 23:59:59 of every day).
Skipping for ", segment)) + location_features <- create_empty_file(requested_features) + } else { + # Count how many minutes of data we use to get location features + # Some minutes have multiple fused rows + location_minutes_used <- location %>% + group_by(local_date, local_hour) %>% + summarise(n_minutes = n_distinct(local_minute)) %>% + group_by(local_date) %>% + summarise(locations_barnett_minutes_data_used = sum(n_minutes)) %>% + select(local_date, locations_barnett_minutes_data_used) + + # Save day segment to attach it later + location_dates_segments <- location %>% select(local_date, local_segment) %>% distinct(local_date, .keep_all = TRUE) + + # Select only the columns that the algorithm needs + location <- location %>% select(timestamp, latitude = double_latitude, longitude = double_longitude, altitude = double_altitude, accuracy) + outputMobility <- MobilityFeatures(location, ACCURACY_LIM = accuracy_limit, tz = timezone) + + if(is.null(outputMobility)){ + location_features <- create_empty_file(requested_features) + } else{ + # Copy index (dates) as a column + features <- cbind(rownames(outputMobility$featavg), outputMobility$featavg) + features <- as.data.frame(features) + features[-1] <- lapply(lapply(features[-1], as.character), as.numeric) + colnames(features)=c("local_date",tolower(paste("locations_barnett", colnames(outputMobility$featavg), sep = "_"))) + # Add the minute count column + features <- left_join(features, location_minutes_used, by = "local_date") + # Add the day segment column for consistency + features <- left_join(features, location_dates_segments, by = "local_date") + location_features <- features %>% select(all_of(requested_features)) + } + } + } else { + location_features <- create_empty_file(requested_features) + } + + if(ncol(location_features) != length(requested_features)) + stop(paste0("The number of features in the output dataframe (=", ncol(location_features),") does not match the expected value (=", length(requested_features),"). 
Verify your barnett location features")) + return(location_features) +} \ No newline at end of file diff --git a/src/features/location_doryab/location_base.py b/src/features/location/doryab/main.py similarity index 69% rename from src/features/location_doryab/location_base.py rename to src/features/location/doryab/main.py index fad096e8..cffb2597 100644 --- a/src/features/location_doryab/location_base.py +++ b/src/features/location/doryab/main.py @@ -4,21 +4,32 @@ from astropy.timeseries import LombScargle from sklearn.cluster import DBSCAN from math import radians, cos, sin, asin, sqrt -def base_location_features(location_data, day_segment, requested_features, dbscan_eps, dbscan_minsamples, threshold_static, maximum_gap_allowed,sampling_frequency): +def doryab_location_features(location_data, day_segment, params, filter_data_by_segment, *args, **kwargs): + requested_features = params["FEATURES"] + dbscan_eps = params["DBSCAN_EPS"] + dbscan_minsamples = params["DBSCAN_MINSAMPLES"] + threshold_static = params["THRESHOLD_STATIC"] + maximum_gap_allowed = params["MAXIMUM_GAP_ALLOWED"] + sampling_frequency = params["SAMPLING_FREQUENCY"] + + minutes_data_used = params["MINUTES_DATA_USED"] + if(minutes_data_used): + requested_features.append("minutesdataused") + # name of the features this function can compute base_features_names = ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","minutesdataused"] # the subset of requested features this function can compute features_to_compute = list(set(requested_features) & set(base_features_names)) + if location_data.empty: - location_features = pd.DataFrame(columns=["local_date"] + ["location_" + day_segment + "_" + x for x in features_to_compute]) + location_features = pd.DataFrame(columns=["local_segment"] + ["locations_doryab_" + x for x in features_to_compute]) else: - if day_segment != "daily": - location_data = location_data[location_data["local_day_segment"] == day_segment] + location_data = filter_data_by_segment(location_data, day_segment) if location_data.empty: - location_features = pd.DataFrame(columns=["local_date"] + ["location_" + day_segment + "_" + x for x in features_to_compute]) + location_features = pd.DataFrame(columns=["local_segment"] + ["locations_doryab_" + x for x in features_to_compute]) else: location_features = pd.DataFrame() @@ -26,108 +37,108 @@ def base_location_features(location_data, day_segment, requested_features, dbsca sampling_frequency = getSamplingFrequency(location_data) if "minutesdataused" in features_to_compute: - for localDate in location_data["local_date"].unique(): - location_features.loc[localDate,"location_" + day_segment + "_minutesdataused"] = getMinutesData(location_data[location_data["local_date"]==localDate]) + for localDate in location_data["local_segment"].unique(): + location_features.loc[localDate,"locations_doryab_minutesdataused"] = getMinutesData(location_data[location_data["local_segment"]==localDate]) - location_features.index.name = 'local_date' + location_features.index.name = 'local_segment' location_data = location_data[(location_data['double_latitude']!=0.0) & (location_data['double_longitude']!=0.0)] if 
"locationvariance" in features_to_compute: - location_features["location_" + day_segment + "_locationvariance"] = location_data.groupby(['local_date'])['double_latitude'].var() + location_data.groupby(['local_date'])['double_longitude'].var() + location_features["locations_doryab_locationvariance"] = location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var() if "loglocationvariance" in features_to_compute: - location_features["location_" + day_segment + "_loglocationvariance"] = (location_data.groupby(['local_date'])['double_latitude'].var() + location_data.groupby(['local_date'])['double_longitude'].var()).apply(lambda x: np.log10(x) if x > 0 else None) + location_features["locations_doryab_loglocationvariance"] = (location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var()).apply(lambda x: np.log10(x) if x > 0 else None) preComputedDistanceandSpeed = pd.DataFrame() - for localDate in location_data['local_date'].unique(): - distance, speeddf = get_all_travel_distances_meters_speed(location_data[location_data['local_date']==localDate],threshold_static,maximum_gap_allowed) + for localDate in location_data['local_segment'].unique(): + distance, speeddf = get_all_travel_distances_meters_speed(location_data[location_data['local_segment']==localDate],threshold_static,maximum_gap_allowed) preComputedDistanceandSpeed.loc[localDate,"distance"] = distance.sum() preComputedDistanceandSpeed.loc[localDate,"avgspeed"] = speeddf[speeddf['speedTag'] == 'Moving']['speed'].mean() preComputedDistanceandSpeed.loc[localDate,"varspeed"] = speeddf[speeddf['speedTag'] == 'Moving']['speed'].var() if "totaldistance" in features_to_compute: - for localDate in location_data['local_date'].unique(): - location_features.loc[localDate,"location_" + day_segment + "_totaldistance"] = preComputedDistanceandSpeed.loc[localDate,"distance"] + for localDate in location_data['local_segment'].unique(): + location_features.loc[localDate,"locations_doryab_totaldistance"] = preComputedDistanceandSpeed.loc[localDate,"distance"] if "averagespeed" in features_to_compute: - for localDate in location_data['local_date'].unique(): - location_features.loc[localDate,"location_" + day_segment + "_averagespeed"] = preComputedDistanceandSpeed.loc[localDate,"avgspeed"] + for localDate in location_data['local_segment'].unique(): + location_features.loc[localDate,"locations_doryab_averagespeed"] = preComputedDistanceandSpeed.loc[localDate,"avgspeed"] if "varspeed" in features_to_compute: - for localDate in location_data['local_date'].unique(): - location_features.loc[localDate,"location_" + day_segment + "_varspeed"] = preComputedDistanceandSpeed.loc[localDate,"varspeed"] + for localDate in location_data['local_segment'].unique(): + location_features.loc[localDate,"locations_doryab_varspeed"] = preComputedDistanceandSpeed.loc[localDate,"varspeed"] if "circadianmovement" in features_to_compute: - for localDate in location_data['local_date'].unique(): - location_features.loc[localDate,"location_" + day_segment + "_circadianmovement"] = circadian_movement(location_data[location_data['local_date']==localDate]) + for localDate in location_data['local_segment'].unique(): + location_features.loc[localDate,"locations_doryab_circadianmovement"] = circadian_movement(location_data[location_data['local_segment']==localDate]) newLocationData = cluster_and_label(location_data, eps= 
distance_to_degrees(dbscan_eps), min_samples=dbscan_minsamples) if "numberofsignificantplaces" in features_to_compute: - for localDate in newLocationData['local_date'].unique(): - location_features.loc[localDate,"location_" + day_segment + "_numberofsignificantplaces"] = number_of_significant_places(newLocationData[newLocationData['local_date']==localDate]) + for localDate in newLocationData['local_segment'].unique(): + location_features.loc[localDate,"locations_doryab_numberofsignificantplaces"] = number_of_significant_places(newLocationData[newLocationData['local_segment']==localDate]) if "numberlocationtransitions" in features_to_compute: - for localDate in newLocationData['local_date'].unique(): - location_features.loc[localDate,"location_" + day_segment + "_numberlocationtransitions"] = number_location_transitions(newLocationData[newLocationData['local_date']==localDate]) + for localDate in newLocationData['local_segment'].unique(): + location_features.loc[localDate,"locations_doryab_numberlocationtransitions"] = number_location_transitions(newLocationData[newLocationData['local_segment']==localDate]) if "radiusgyration" in features_to_compute: - for localDate in newLocationData['local_date'].unique(): - location_features.loc[localDate,"location_" + day_segment + "_radiusgyration"] = radius_of_gyration(newLocationData[newLocationData['local_date']==localDate],sampling_frequency) + for localDate in newLocationData['local_segment'].unique(): + location_features.loc[localDate,"locations_doryab_radiusgyration"] = radius_of_gyration(newLocationData[newLocationData['local_segment']==localDate],sampling_frequency) if "timeattop1location" in features_to_compute: - for localDate in newLocationData['local_date'].unique(): - location_features.loc[localDate,"location_" + day_segment + "_timeattop1"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_date']==localDate],1,sampling_frequency) + for localDate in newLocationData['local_segment'].unique(): + location_features.loc[localDate,"locations_doryab_timeattop1"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_segment']==localDate],1,sampling_frequency) if "timeattop2location" in features_to_compute: - for localDate in newLocationData['local_date'].unique(): - location_features.loc[localDate,"location_" + day_segment + "_timeattop2"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_date']==localDate],2,sampling_frequency) + for localDate in newLocationData['local_segment'].unique(): + location_features.loc[localDate,"locations_doryab_timeattop2"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_segment']==localDate],2,sampling_frequency) if "timeattop3location" in features_to_compute: - for localDate in newLocationData['local_date'].unique(): - location_features.loc[localDate,"location_" + day_segment + "_timeattop3"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_date']==localDate],3,sampling_frequency) + for localDate in newLocationData['local_segment'].unique(): + location_features.loc[localDate,"locations_doryab_timeattop3"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_segment']==localDate],3,sampling_frequency) if "movingtostaticratio" in features_to_compute: - for localDate in newLocationData['local_date'].unique(): - location_features.loc[localDate,"location_" + day_segment + "_movingtostaticratio"] = (newLocationData[newLocationData['local_date']==localDate].shape[0]*sampling_frequency) / 
(location_data[location_data['local_date']==localDate].shape[0] * sampling_frequency) + for localDate in newLocationData['local_segment'].unique(): + location_features.loc[localDate,"locations_doryab_movingtostaticratio"] = (newLocationData[newLocationData['local_segment']==localDate].shape[0]*sampling_frequency) / (location_data[location_data['local_segment']==localDate].shape[0] * sampling_frequency) if "outlierstimepercent" in features_to_compute: - for localDate in newLocationData['local_date'].unique(): - location_features.loc[localDate,"location_" + day_segment + "_outlierstimepercent"] = outliers_time_percent(newLocationData[newLocationData['local_date']==localDate],sampling_frequency) + for localDate in newLocationData['local_segment'].unique(): + location_features.loc[localDate,"locations_doryab_outlierstimepercent"] = outliers_time_percent(newLocationData[newLocationData['local_segment']==localDate],sampling_frequency) preComputedmaxminCluster = pd.DataFrame() - for localDate in newLocationData['local_date'].unique(): - smax, smin, sstd,smean = len_stay_at_clusters_in_minutes(newLocationData[newLocationData['local_date']==localDate],sampling_frequency) - preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_maxlengthstayatclusters"] = smax - preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_minlengthstayatclusters"] = smin - preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_stdlengthstayatclusters"] = sstd - preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_meanlengthstayatclusters"] = smean + for localDate in newLocationData['local_segment'].unique(): + smax, smin, sstd,smean = len_stay_at_clusters_in_minutes(newLocationData[newLocationData['local_segment']==localDate],sampling_frequency) + preComputedmaxminCluster.loc[localDate,"locations_doryab_maxlengthstayatclusters"] = smax + preComputedmaxminCluster.loc[localDate,"locations_doryab_minlengthstayatclusters"] = smin + preComputedmaxminCluster.loc[localDate,"locations_doryab_stdlengthstayatclusters"] = sstd + preComputedmaxminCluster.loc[localDate,"locations_doryab_meanlengthstayatclusters"] = smean if "maxlengthstayatclusters" in features_to_compute: - for localDate in newLocationData['local_date'].unique(): - location_features.loc[localDate,"location_" + day_segment + "_maxlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_maxlengthstayatclusters"] + for localDate in newLocationData['local_segment'].unique(): + location_features.loc[localDate,"locations_doryab_maxlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"locations_doryab_maxlengthstayatclusters"] if "minlengthstayatclusters" in features_to_compute: - for localDate in newLocationData['local_date'].unique(): - location_features.loc[localDate,"location_" + day_segment + "_minlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_minlengthstayatclusters"] + for localDate in newLocationData['local_segment'].unique(): + location_features.loc[localDate,"locations_doryab_minlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"locations_doryab_minlengthstayatclusters"] if "stdlengthstayatclusters" in features_to_compute: - for localDate in newLocationData['local_date'].unique(): - location_features.loc[localDate,"location_" + day_segment + "_stdlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_stdlengthstayatclusters"] + for localDate in 
newLocationData['local_segment'].unique(): + location_features.loc[localDate,"locations_doryab_stdlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"locations_doryab_stdlengthstayatclusters"] if "meanlengthstayatclusters" in features_to_compute: - for localDate in newLocationData['local_date'].unique(): - location_features.loc[localDate,"location_" + day_segment + "_meanlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_meanlengthstayatclusters"] + for localDate in newLocationData['local_segment'].unique(): + location_features.loc[localDate,"locations_doryab_meanlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"locations_doryab_meanlengthstayatclusters"] if "locationentropy" in features_to_compute: - for localDate in newLocationData['local_date'].unique(): - location_features.loc[localDate,"location_" + day_segment + "_locationentropy"] = location_entropy(newLocationData[newLocationData['local_date']==localDate]) + for localDate in newLocationData['local_segment'].unique(): + location_features.loc[localDate,"locations_doryab_locationentropy"] = location_entropy(newLocationData[newLocationData['local_segment']==localDate]) if "normalizedlocationentropy" in features_to_compute: - for localDate in newLocationData['local_date'].unique(): - location_features.loc[localDate,"location_" + day_segment + "_normalizedlocationentropy"] = location_entropy_normalized(newLocationData[newLocationData['local_date']==localDate]) + for localDate in newLocationData['local_segment'].unique(): + location_features.loc[localDate,"locations_doryab_normalizedlocationentropy"] = location_entropy_normalized(newLocationData[newLocationData['local_segment']==localDate]) location_features = location_features.reset_index() diff --git a/src/features/location/locations_entry.R b/src/features/location/locations_entry.R new file mode 100644 index 00000000..b2349a78 --- /dev/null +++ b/src/features/location/locations_entry.R @@ -0,0 +1,44 @@ +source("renv/activate.R") +source("src/features/utils/utils.R") +library("dplyr") +library("stringr") +library("tidyr") + +location_data <- read.csv(snakemake@input[["location_data"]], stringsAsFactors = FALSE) +day_segments_labels <- read.csv(snakemake@input[["day_segments_labels"]], stringsAsFactors = FALSE) +provider <- snakemake@params[["provider"]] +provider_key <- snakemake@params[["provider_key"]] + +location_features <- data.frame(local_segment = character(), stringsAsFactors = FALSE) + +if(!"FEATURES" %in% names(provider)) + stop(paste0("Provider config[LOCATION][PROVIDERS][", provider_key,"] is missing a FEATURES attribute in config.yaml")) + +if(provider[["COMPUTE"]] == TRUE){ + code_path <- paste0("src/features/location/", provider[["SRC_FOLDER"]], "/main.R") + source(code_path) + features_function <- match.fun(paste0(provider[["SRC_FOLDER"]], "_location_features")) + day_segments <- day_segments_labels %>% pull(label) + for (day_segment in day_segments){ + print(paste(rapids_log_tag,"Processing", provider_key, day_segment)) + + features <- features_function(location_data, day_segment, provider) + + # Check all feature names contain the provider key so they are unique + features_names <- colnames(features %>% select(-local_segment)) + if(any(!grepl(paste0(".*(",str_to_lower(provider_key),").*"), features_names))) + stop(paste("All location feature names for", provider_key, "must contain the provider key in lower case, but the following do not [",
paste(features_names[!grepl(paste0(".*(",str_to_lower(provider_key),").*"), features_names)], collapse = ", "), "]")) + + location_features <- merge(location_features, features, all = TRUE) + } +} else { + for(feature in provider[["FEATURES"]]) + location_features[,feature] <- NA +} + +location_features <- location_features %>% separate(col = local_segment, + into = c("local_segment_label", "local_start_date", "local_start_time", "local_end_date", "local_end_time"), + sep = "#", + remove = FALSE) + +write.csv(location_features, snakemake@output[[1]], row.names = FALSE) diff --git a/src/features/location/locations_entry.py b/src/features/location/locations_entry.py new file mode 100644 index 00000000..c47fdf5a --- /dev/null +++ b/src/features/location/locations_entry.py @@ -0,0 +1,39 @@ +import pandas as pd +from importlib import import_module, util +from pathlib import Path + +# import filter_data_by_segment from src/features/utils/utils.py +spec = util.spec_from_file_location("util", str(Path(snakemake.scriptdir).parent / "utils" / "utils.py")) +mod = util.module_from_spec(spec) +spec.loader.exec_module(mod) +filter_data_by_segment = getattr(mod, "filter_data_by_segment") +rapids_log_tag = getattr(mod, "rapids_log_tag") + +location_data = pd.read_csv(snakemake.input["location_data"][0]) +day_segments_labels = pd.read_csv(snakemake.input["day_segments_labels"], header=0) +mypath = snakemake.params["mypath"] +provider = snakemake.params["provider"] +provider_key = snakemake.params["provider_key"] +location_features = pd.DataFrame(columns=["local_segment"]) + +if "FEATURES" not in provider: + raise ValueError("Provider config[LOCATION][PROVIDERS][{}] is missing a FEATURES attribute in config.yaml".format(provider_key)) + +if provider["COMPUTE"] == True: + code_path = provider["SRC_FOLDER"] + ".main" + feature_module = import_module(code_path) + feature_function = getattr(feature_module, provider["SRC_FOLDER"] + "_location_features") + + for day_segment in day_segments_labels["label"]: + print("{} Processing {} {}".format(rapids_log_tag, provider_key, day_segment)) + features = feature_function(location_data, day_segment, provider, filter_data_by_segment=filter_data_by_segment) + location_features = location_features.merge(features, how="outer") +else: + for feature in provider["FEATURES"]: + location_features[feature] = None + +segment_columns = pd.DataFrame() +segment_columns[["local_segment_label", "local_start_date", "local_start_time", "local_end_date", "local_end_time"]] = location_features["local_segment"].str.split(pat="#", expand=True) +for i in range(segment_columns.shape[1]): + location_features.insert(1 + i, segment_columns.columns[i], segment_columns[segment_columns.columns[i]]) +location_features.to_csv(snakemake.output[0], index=False) \ No newline at end of file diff --git a/src/features/location_barnett_features.R b/src/features/location_barnett_features.R deleted file mode 100644 index 94e99593..00000000 --- a/src/features/location_barnett_features.R +++ /dev/null @@ -1,89 +0,0 @@ -source("renv/activate.R") -# Load Ian Barnett's code.
Taken from https://scholar.harvard.edu/ibarnett/software/gpsmobility -file.sources = list.files(c("src/features/location_barnett"), pattern="*.R$", full.names=TRUE, ignore.case=TRUE) -sapply(file.sources,source,.GlobalEnv) - -library(dplyr) - -write_empty_file <- function(file_path, requested_features){ - write.csv(data.frame(local_date= character(), - location_barnett_hometime= numeric(), - location_barnett_disttravelled= numeric(), - location_barnett_rog= numeric(), - location_barnett_maxdiam= numeric(), - location_barnett_maxhomedist= numeric(), - location_barnett_siglocsvisited= numeric(), - location_barnett_avgflightlen= numeric(), - location_barnett_stdflightlen= numeric(), - location_barnett_avgflightdur= numeric(), - location_barnett_stdflightdur= numeric(), - location_barnett_probpause= numeric(), - location_barnett_siglocentropy= numeric(), - location_barnett_minsmissing= numeric(), - location_barnett_circdnrtn= numeric(), - location_barnett_wkenddayrtn= numeric(), - minutes_data_used= numeric() - ) %>% select(requested_features), file_path, row.names = F) -} - -location <- read.csv(snakemake@input[["locations"]], stringsAsFactors = F) -# The choice between RESAMPLE_FUSED and the original location data happens at the rule level in the function -# optional_location_input in features.snakefile -locations_to_use <- snakemake@params[["locations_to_use"]] -accuracy_limit <- snakemake@params[["accuracy_limit"]] -timezone <- snakemake@params[["timezone"]] -minutes_data_used <- snakemake@params[["minutes_data_used"]] -requested_features <- intersect(unlist(snakemake@params["features"], use.names = F), - c("hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","minsmissing","circdnrtn","wkenddayrtn")) -requested_features <- c("local_date", paste("location_barnett", requested_features, sep = "_")) -if(minutes_data_used) - requested_features <- c(requested_features, "minutes_data_used") - -if(!locations_to_use %in% c("ALL_EXCEPT_FUSED", "RESAMPLE_FUSED", "ALL")){ - print("Unkown filter, provide one of the following three: ALL, ALL_EXCEPT_FUSED, or RESAMPLE_FUSED") - quit(save = "no", status = 1, runLast = FALSE) -} - - # excludes fused and resample -if(locations_to_use == "ALL_EXCEPT_FUSED") - location <- location %>% filter(provider == "gps") - -# Remove 0,0 location coordinates -location <- location %>% filter(double_latitude != 0 & double_longitude != 0) - -# Excludes datasets with less than 24 hours of data -if(max(location$timestamp) - min(location$timestamp) < 86400000) - location <- head(location, 0) - -if (nrow(location) > 1){ - - # Count how many minutes of data we use to get location features - # Some minutes have multiple fused rows - location_minutes_used <- location %>% - group_by(local_date, local_hour) %>% - summarise(n_minutes = n_distinct(local_minute)) %>% - group_by(local_date) %>% - summarise(minutes_data_used = sum(n_minutes)) %>% - select(local_date, minutes_data_used) - - location <- location %>% - select(timestamp, latitude = double_latitude, longitude = double_longitude, altitude = double_altitude, accuracy) - - outputMobility <- MobilityFeatures(location, ACCURACY_LIM = accuracy_limit, tz = timezone) - - if(is.null(outputMobility)){ - write_empty_file(snakemake@output[[1]], requested_features) - } else{ - # Copy index (dates) as a column - features <- cbind(rownames(outputMobility$featavg), outputMobility$featavg) - features <- as.data.frame(features) - 
features[-1] <- lapply(lapply(features[-1], as.character), as.numeric) - colnames(features)=c("local_date",tolower(paste("location_barnett", colnames(outputMobility$featavg), sep = "_"))) - # Add the minute count column - features <- left_join(features, location_minutes_used, by = "local_date") - write.csv(features %>% select(requested_features), snakemake@output[[1]], row.names = F) - } - -} else { - write_empty_file(snakemake@output[[1]], requested_features) -} diff --git a/src/features/location_doryab_features.py b/src/features/location_doryab_features.py deleted file mode 100644 index 8d749483..00000000 --- a/src/features/location_doryab_features.py +++ /dev/null @@ -1,24 +0,0 @@ -import pandas as pd -from location_doryab.location_base import base_location_features - -location_data = pd.read_csv(snakemake.input[0], parse_dates=["local_date_time", "local_date"]) -day_segment = snakemake.params["day_segment"] -requested_features = snakemake.params["features"] -location_features = pd.DataFrame(columns=["local_date"]) -dbscan_eps = snakemake.params["dbscan_eps"] -dbscan_minsamples = snakemake.params["dbscan_minsamples"] -threshold_static = snakemake.params["threshold_static"] -maximum_gap_allowed = snakemake.params["maximum_gap_allowed"] -minutes_data_used = snakemake.params["minutes_data_used"] -sampling_frequency = snakemake.params["sampling_frequency"] - -if(minutes_data_used): - requested_features.append("minutesdataused") - -base_features = base_location_features(location_data, day_segment, requested_features, dbscan_eps, dbscan_minsamples,threshold_static,maximum_gap_allowed,sampling_frequency) - -location_features = location_features.merge(base_features, on="local_date", how="outer") - -assert len(requested_features) + 1 == location_features.shape[1], "The number of features in the output dataframe (=" + str(location_features.shape[1]) + ") does not match the expected value (=" + str(len(requested_features)) + " + 1). 
Verify your location feature extraction functions" - -location_features.to_csv(snakemake.output[0], index=False) \ No newline at end of file diff --git a/src/features/utils/utils.R b/src/features/utils/utils.R new file mode 100644 index 00000000..544a0655 --- /dev/null +++ b/src/features/utils/utils.R @@ -0,0 +1,12 @@ +library("stringr") +filter_data_by_segment <- function(data, day_segment){ + # Filter the rows that belong to day_segment, and put the segment full name in a new column for grouping + date_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2}" + hour_regex = "[0-9]{2}:[0-9]{2}:[0-9]{2}" + data <- data %>% + filter(grepl(paste0("\\[", day_segment, "#"), assigned_segments)) %>% + mutate(local_segment = str_extract(assigned_segments, paste0("\\[", day_segment, "#", date_regex, "#", hour_regex, "#", date_regex, "#", hour_regex, "\\]")), + local_segment = str_sub(local_segment, 2, -2)) # strip the enclosing brackets ([]) + return(data) +} +rapids_log_tag <- "RAPIDS:" \ No newline at end of file diff --git a/src/features/utils/utils.py b/src/features/utils/utils.py new file mode 100644 index 00000000..a0736a78 --- /dev/null +++ b/src/features/utils/utils.py @@ -0,0 +1,9 @@ +# Filter the rows that belong to day_segment, and put the segment full name in a new column for grouping +def filter_data_by_segment(data, day_segment): + date_regex = r"[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2}" + hour_regex = "[0-9]{2}:[0-9]{2}:[0-9]{2}" + segment_regex = r"\[({}#{}#{}#{}#{})\]".format(day_segment, date_regex, hour_regex, date_regex, hour_regex) + data["local_segment"] = data["assigned_segments"].str.extract(segment_regex, expand=True) + return data.dropna(subset=["local_segment"]) + +rapids_log_tag = "RAPIDS:" \ No newline at end of file
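
For reference, a minimal sketch (with hypothetical data values) of how filter_data_by_segment in src/features/utils/utils.py is expected to behave; the segment string follows the label#start_date#start_time#end_date#end_time format that locations_entry.R and locations_entry.py later split into columns:

    import pandas as pd
    from utils import filter_data_by_segment  # assuming src/features/utils is importable

    # Hypothetical input: one row tagged with the "daily" segment, one row left untagged
    data = pd.DataFrame({
        "double_latitude": [42.44, 40.44],
        "assigned_segments": ["[daily#2020-08-28#00:00:00#2020-08-28#23:59:59]", ""],
    })

    filtered = filter_data_by_segment(data, "daily")
    # filtered keeps only the first row; its new local_segment column holds
    # "daily#2020-08-28#00:00:00#2020-08-28#23:59:59" (the brackets are excluded by the capture group)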