From 24bf62a7ab3e4a7d0d201b8af4703b09f9d4e8d8 Mon Sep 17 00:00:00 2001 From: JulioV Date: Mon, 19 Oct 2020 15:07:12 -0400 Subject: [PATCH] Update file names --- Snakefile | 220 ++++++------- config.yaml | 129 ++++---- rules/common.smk | 51 +-- rules/features.smk | 302 +++++------------- rules/preprocessing.smk | 73 ++--- ...tion_genres.R => application_categories.R} | 0 src/data/assign_to_day_segment.R | 231 +++++++------- src/data/download_dataset.R | 44 ++- src/data/unify_ios_android.R | 3 +- src/data/unify_utils.R | 48 ++- .../panda/main.py | 0 .../rapids/main.py | 0 .../episodes/activity_recognition_episodes.R | 0 .../rapids/main.py | 0 .../rapids/main.py | 0 .../episodes/battery_episodes.R | 0 .../{battery => phone_battery}/rapids/main.py | 0 .../rapids/main.R | 0 .../{calls => phone_calls}/rapids/main.R | 0 .../rapids/main.py | 0 .../{light => phone_light}/rapids/main.py | 0 .../barnett/library/AvgFlightDur.R | 0 .../barnett/library/AvgFlightLen.R | 0 .../barnett/library/Collapse2Pause.R | 0 .../barnett/library/DailyMobilityPlots.R | 0 .../barnett/library/DailyRoutineIndex.R | 0 .../barnett/library/DayDist.R | 0 .../barnett/library/DistanceTravelled.R | 0 .../barnett/library/ExtractFlights.R | 0 .../barnett/library/ExtractTimePeriod.R | 0 .../barnett/library/GPS2MobMat.R | 0 .../barnett/library/GPSmobility-internal.R | 0 .../barnett/library/GetMobilityFeaturesMat.R | 0 .../barnett/library/GuessPause.R | 0 .../barnett/library/Hometime.R | 0 .../barnett/library/InitializeParams.R | 0 .../barnett/library/IsFlight.R | 0 .../barnett/library/LatLong2XY.R | 0 .../barnett/library/LocationAt.R | 0 .../barnett/library/MaxDiam.R | 0 .../library/MaxDistBetweenTrajectories.R | 0 .../barnett/library/MaxHomeDist.R | 0 .../barnett/library/MaxRadius.R | 0 .../barnett/library/MinsMissing.R | 0 .../barnett/library/MobilityFeatures.R | 0 .../barnett/library/MobmatQualityOK.R | 0 .../barnett/library/ProbPause.R | 0 .../barnett/library/ProgressBar.R | 0 .../barnett/library/RadiusOfGyration.R | 0 .../barnett/library/RandomBridge.R | 0 .../barnett/library/SigLocEntropy.R | 0 .../barnett/library/SigLocs.R | 0 .../barnett/library/SigLocsVisited.R | 0 .../barnett/library/SimulateMobilityGaps.R | 0 .../barnett/library/StdFlightDur.R | 0 .../barnett/library/StdFlightLen.R | 0 .../barnett/library/WriteSurveyAnswers2File.R | 0 .../barnett/library/plot.flights.R | 0 .../barnett/library/plotlimits.R | 0 .../barnett/main.R | 13 +- .../doryab/main.py | 0 .../rapids/main.R | 0 .../episodes/screen_episodes.R | 0 .../{screen => phone_screen}/rapids/main.py | 0 .../rapids/main.R | 0 src/features/phone_wifi_visible/rapids/main.R | 46 +++ src/features/utils/resample_episodes.R | 2 + src/features/utils/utils.py | 2 +- 68 files changed, 495 insertions(+), 669 deletions(-) rename src/data/{application_genres.R => application_categories.R} (100%) rename src/features/{accelerometer => phone_accelerometer}/panda/main.py (100%) rename src/features/{accelerometer => phone_accelerometer}/rapids/main.py (100%) rename src/features/{activity_recognition => phone_activity_recognition}/episodes/activity_recognition_episodes.R (100%) rename src/features/{activity_recognition => phone_activity_recognition}/rapids/main.py (100%) rename src/features/{applications_foreground => phone_applications_foreground}/rapids/main.py (100%) rename src/features/{battery => phone_battery}/episodes/battery_episodes.R (100%) rename src/features/{battery => phone_battery}/rapids/main.py (100%) rename src/features/{bluetooth => phone_bluetooth}/rapids/main.R (100%) rename src/features/{calls => phone_calls}/rapids/main.R (100%) rename src/features/{conversation => phone_conversation}/rapids/main.py (100%) rename src/features/{light => phone_light}/rapids/main.py (100%) rename src/features/{locations => phone_locations}/barnett/library/AvgFlightDur.R (100%) rename src/features/{locations => phone_locations}/barnett/library/AvgFlightLen.R (100%) rename src/features/{locations => phone_locations}/barnett/library/Collapse2Pause.R (100%) rename src/features/{locations => phone_locations}/barnett/library/DailyMobilityPlots.R (100%) rename src/features/{locations => phone_locations}/barnett/library/DailyRoutineIndex.R (100%) rename src/features/{locations => phone_locations}/barnett/library/DayDist.R (100%) rename src/features/{locations => phone_locations}/barnett/library/DistanceTravelled.R (100%) rename src/features/{locations => phone_locations}/barnett/library/ExtractFlights.R (100%) rename src/features/{locations => phone_locations}/barnett/library/ExtractTimePeriod.R (100%) rename src/features/{locations => phone_locations}/barnett/library/GPS2MobMat.R (100%) rename src/features/{locations => phone_locations}/barnett/library/GPSmobility-internal.R (100%) rename src/features/{locations => phone_locations}/barnett/library/GetMobilityFeaturesMat.R (100%) rename src/features/{locations => phone_locations}/barnett/library/GuessPause.R (100%) rename src/features/{locations => phone_locations}/barnett/library/Hometime.R (100%) rename src/features/{locations => phone_locations}/barnett/library/InitializeParams.R (100%) rename src/features/{locations => phone_locations}/barnett/library/IsFlight.R (100%) rename src/features/{locations => phone_locations}/barnett/library/LatLong2XY.R (100%) rename src/features/{locations => phone_locations}/barnett/library/LocationAt.R (100%) rename src/features/{locations => phone_locations}/barnett/library/MaxDiam.R (100%) rename src/features/{locations => phone_locations}/barnett/library/MaxDistBetweenTrajectories.R (100%) rename src/features/{locations => phone_locations}/barnett/library/MaxHomeDist.R (100%) rename src/features/{locations => phone_locations}/barnett/library/MaxRadius.R (100%) rename src/features/{locations => phone_locations}/barnett/library/MinsMissing.R (100%) rename src/features/{locations => phone_locations}/barnett/library/MobilityFeatures.R (100%) rename src/features/{locations => phone_locations}/barnett/library/MobmatQualityOK.R (100%) rename src/features/{locations => phone_locations}/barnett/library/ProbPause.R (100%) rename src/features/{locations => phone_locations}/barnett/library/ProgressBar.R (100%) rename src/features/{locations => phone_locations}/barnett/library/RadiusOfGyration.R (100%) rename src/features/{locations => phone_locations}/barnett/library/RandomBridge.R (100%) rename src/features/{locations => phone_locations}/barnett/library/SigLocEntropy.R (100%) rename src/features/{locations => phone_locations}/barnett/library/SigLocs.R (100%) rename src/features/{locations => phone_locations}/barnett/library/SigLocsVisited.R (100%) rename src/features/{locations => phone_locations}/barnett/library/SimulateMobilityGaps.R (100%) rename src/features/{locations => phone_locations}/barnett/library/StdFlightDur.R (100%) rename src/features/{locations => phone_locations}/barnett/library/StdFlightLen.R (100%) rename src/features/{locations => phone_locations}/barnett/library/WriteSurveyAnswers2File.R (100%) rename src/features/{locations => phone_locations}/barnett/library/plot.flights.R (100%) rename src/features/{locations => phone_locations}/barnett/library/plotlimits.R (100%) rename src/features/{locations => phone_locations}/barnett/main.R (89%) rename src/features/{locations => phone_locations}/doryab/main.py (100%) rename src/features/{messages => phone_messages}/rapids/main.R (100%) rename src/features/{screen => phone_screen}/episodes/screen_episodes.R (100%) rename src/features/{screen => phone_screen}/rapids/main.py (100%) rename src/features/{wifi => phone_wifi_connected}/rapids/main.R (100%) create mode 100644 src/features/phone_wifi_visible/rapids/main.R diff --git a/Snakefile b/Snakefile index e95d14ab..30f1c64d 100644 --- a/Snakefile +++ b/Snakefile @@ -13,17 +13,11 @@ if len(config["PIDS"]) == 0: raise ValueError("Add participants IDs to PIDS in config.yaml. Remember to create their participant files in data/external") if config["PHONE_VALID_SENSED_BINS"]["COMPUTE"] or config["PHONE_VALID_SENSED_DAYS"]["COMPUTE"]: # valid sensed bins is necessary for sensed days, so we add these files anyways if sensed days are requested - if len(config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"]) == 0: - raise ValueError("If you want to compute PHONE_VALID_SENSED_BINS or PHONE_VALID_SENSED_DAYS, you need to add at least one table to [PHONE_VALID_SENSED_BINS][DB_TABLES] in config.yaml") + if len(config["PHONE_VALID_SENSED_BINS"]["PHONE_SENSORS"]) == 0: + raise ValueError("If you want to compute PHONE_VALID_SENSED_BINS or PHONE_VALID_SENSED_DAYS, you need to add at least one PHONE_SENSOR to [PHONE_VALID_SENSED_BINS][PHONE_SENSORS] in config.yaml") - pids_android = list(filter(lambda pid: infer_participant_platform("data/external/" + pid) == "android", config["PIDS"])) - pids_ios = list(filter(lambda pid: infer_participant_platform("data/external/" + pid) == "ios", config["PIDS"])) - tables_android = [table for table in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["IOS"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]]] # for android, discard any ios tables that may exist - tables_ios = [table for table in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"]]] # for ios, discard any android tables that may exist - - for pids,table in zip([pids_android, pids_ios], [tables_android, tables_ios]): - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=pids, sensor=table)) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=pids, sensor=table)) + files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=map(str.lower, config["PHONE_VALID_SENSED_BINS"]["PHONE_SENSORS"]))) + files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=map(str.lower, config["PHONE_VALID_SENSED_BINS"]["PHONE_SENSORS"]))) files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_timestamps.csv", pid=config["PIDS"])) @@ -33,106 +27,100 @@ if config["PHONE_VALID_SENSED_DAYS"]["COMPUTE"]: min_valid_hours_per_day=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_HOURS_PER_DAY"], min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"])) -for provider in config["MESSAGES"]["PROVIDERS"].keys(): - if config["MESSAGES"]["PROVIDERS"][provider]["COMPUTE"]: - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["MESSAGES"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["MESSAGES"]["DB_TABLE"])) - files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["MESSAGES"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="MESSAGES".lower())) - files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="MESSAGES".lower())) +for provider in config["PHONE_MESSAGES"]["PROVIDERS"].keys(): + if config["PHONE_MESSAGES"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_messages_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_messages_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_messages_features/phone_messages_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_MESSAGES"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_messages.csv", pid=config["PIDS"])) -for provider in config["CALLS"]["PROVIDERS"].keys(): - if config["CALLS"]["PROVIDERS"][provider]["COMPUTE"]: - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"])) - files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["CALLS"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="CALLS".lower())) - files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="CALLS".lower())) +for provider in config["PHONE_CALLS"]["PROVIDERS"].keys(): + if config["PHONE_CALLS"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_calls_raw.csv", pid=config["PIDS"], sensor=config["PHONE_CALLS"]["DB_TABLE"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime.csv", pid=config["PIDS"], sensor=config["PHONE_CALLS"]["DB_TABLE"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime_unified.csv", pid=config["PIDS"], sensor=config["PHONE_CALLS"]["DB_TABLE"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_calls_features/phone_calls_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_CALLS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_calls.csv", pid=config["PIDS"])) -for provider in config["BLUETOOTH"]["PROVIDERS"].keys(): - if config["BLUETOOTH"]["PROVIDERS"][provider]["COMPUTE"]: - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["BLUETOOTH"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["BLUETOOTH"]["DB_TABLE"])) - files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["BLUETOOTH"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="BLUETOOTH".lower())) - files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="BLUETOOTH".lower())) +for provider in config["PHONE_BLUETOOTH"]["PROVIDERS"].keys(): + if config["PHONE_BLUETOOTH"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_bluetooth_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_bluetooth_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_bluetooth_features/phone_bluetooth_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_BLUETOOTH"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_bluetooth.csv", pid=config["PIDS"])) -for provider in config["ACTIVITY_RECOGNITION"]["PROVIDERS"].keys(): - if config["ACTIVITY_RECOGNITION"]["PROVIDERS"][provider]["COMPUTE"]: - pids_android = list(filter(lambda pid: infer_participant_platform("data/external/" + pid) == "android", config["PIDS"])) - pids_ios = list(filter(lambda pid: infer_participant_platform("data/external/" + pid) == "ios", config["PIDS"])) - - for pids,table in zip([pids_android, pids_ios], [config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]]): - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=pids, sensor=table)) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=pids, sensor=table)) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=pids, sensor=table)) - - files_to_compute.extend(expand("data/interim/{pid}/activity_recognition_episodes.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/interim/{pid}/activity_recognition_episodes_resampled.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/interim/{pid}/activity_recognition_episodes_resampled_with_datetime.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["ACTIVITY_RECOGNITION"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="ACTIVITY_RECOGNITION".lower())) - files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="ACTIVITY_RECOGNITION".lower())) +for provider in config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"].keys(): + if config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_activity_recognition_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_activity_recognition_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_activity_recognition_with_datetime_unified.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_activity_recognition_episodes.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_activity_recognition_episodes_resampled.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_activity_recognition_episodes_resampled_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_activity_recognition_features/phone_activity_recognition_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_activity_recognition.csv", pid=config["PIDS"])) -for provider in config["BATTERY"]["PROVIDERS"].keys(): - if config["BATTERY"]["PROVIDERS"][provider]["COMPUTE"]: - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["BATTERY"]["DB_TABLE"])) - files_to_compute.extend(expand("data/interim/{pid}/battery_episodes.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/interim/{pid}/battery_episodes_resampled.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/interim/{pid}/battery_episodes_resampled_with_datetime.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["SCREEN"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="BATTERY".lower())) - files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="BATTERY".lower())) +for provider in config["PHONE_BATTERY"]["PROVIDERS"].keys(): + if config["PHONE_BATTERY"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_battery_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_battery_episodes.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_battery_episodes_resampled.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_battery_episodes_resampled_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_battery_features/phone_battery_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_BATTERY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_battery.csv", pid=config["PIDS"])) -for provider in config["SCREEN"]["PROVIDERS"].keys(): - if config["SCREEN"]["PROVIDERS"][provider]["COMPUTE"]: - if config["SCREEN"]["DB_TABLE"] in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"]: +for provider in config["PHONE_SCREEN"]["PROVIDERS"].keys(): + if config["PHONE_SCREEN"]["PROVIDERS"][provider]["COMPUTE"]: + if "PHONE_SCREEN" in config["PHONE_VALID_SENSED_BINS"]["PHONE_SENSORS"]: files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"])) else: - raise ValueError("Error: Add your screen table (and as many sensor tables as you have) to [PHONE_VALID_SENSED_BINS][DB_TABLES] in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data)") - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["SCREEN"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["SCREEN"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=config["PIDS"], sensor=config["SCREEN"]["DB_TABLE"])) - files_to_compute.extend(expand("data/interim/{pid}/screen_episodes.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/interim/{pid}/screen_episodes_resampled.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/interim/{pid}/screen_episodes_resampled_with_datetime.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["SCREEN"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="SCREEN".lower())) - files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="SCREEN".lower())) + raise ValueError("Error: Add PHONE_SCREEN (and as many phone sensor as you have in your database) to [PHONE_VALID_SENSED_BINS][PHONE_SENSORS] in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data)") + files_to_compute.extend(expand("data/raw/{pid}/phone_screen_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_screen_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_screen_with_datetime_unified.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_screen_episodes.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_screen_episodes_resampled.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_screen_episodes_resampled_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_screen_features/phone_screen_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_SCREEN"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_screen.csv", pid=config["PIDS"])) -for provider in config["LIGHT"]["PROVIDERS"].keys(): - if config["LIGHT"]["PROVIDERS"][provider]["COMPUTE"]: - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["LIGHT"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["LIGHT"]["DB_TABLE"])) - files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["LIGHT"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="LIGHT".lower())) - files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="LIGHT".lower())) +for provider in config["PHONE_LIGHT"]["PROVIDERS"].keys(): + if config["PHONE_LIGHT"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_light_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_light_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_light_features/phone_light_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_LIGHT"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_light.csv", pid=config["PIDS"],)) -for provider in config["ACCELEROMETER"]["PROVIDERS"].keys(): - if config["ACCELEROMETER"]["PROVIDERS"][provider]["COMPUTE"]: - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["ACCELEROMETER"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["ACCELEROMETER"]["DB_TABLE"])) - files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["ACCELEROMETER"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="ACCELEROMETER".lower())) - files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="ACCELEROMETER".lower())) +for provider in config["PHONE_ACCELEROMETER"]["PROVIDERS"].keys(): + if config["PHONE_ACCELEROMETER"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_accelerometer_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_accelerometer_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_accelerometer_features/phone_accelerometer_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_ACCELEROMETER"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_accelerometer.csv", pid=config["PIDS"])) -for provider in config["APPLICATIONS_FOREGROUND"]["PROVIDERS"].keys(): - if config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][provider]["COMPUTE"]: - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_with_genre.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])) - files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="APPLICATIONS_FOREGROUND".lower())) - files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="APPLICATIONS_FOREGROUND".lower())) +for provider in config["PHONE_APPLICATIONS_FOREGROUND"]["PROVIDERS"].keys(): + if config["PHONE_APPLICATIONS_FOREGROUND"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_applications_foreground_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_applications_foreground_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_applications_foreground_with_datetime_with_categories.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_applications_foreground_features/phone_applications_foreground_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_APPLICATIONS_FOREGROUND"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_applications_foreground.csv", pid=config["PIDS"])) -for provider in config["WIFI"]["PROVIDERS"].keys(): - if config["WIFI"]["PROVIDERS"][provider]["COMPUTE"]: - if len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) > 0: - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor_key}_with_datetime_visibleandconnected.csv", pid=config["PIDS"], sensor_key="WIFI".lower())) - files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["WIFI"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="WIFI".lower())) - files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="WIFI".lower())) - if len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) > 0: - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor_key}_with_datetime_visibleandconnected.csv", pid=config["PIDS"], sensor_key="WIFI".lower())) - files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["WIFI"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="WIFI".lower())) - files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="WIFI".lower())) +for provider in config["PHONE_WIFI_VISIBLE"]["PROVIDERS"].keys(): + if config["PHONE_WIFI_VISIBLE"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_wifi_visible_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_wifi_visible_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_wifi_visible_features/phone_wifi_visible_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_WIFI_VISIBLE"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_wifi_visible.csv", pid=config["PIDS"])) + +for provider in config["PHONE_WIFI_CONNECTED"]["PROVIDERS"].keys(): + if config["PHONE_WIFI_CONNECTED"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_wifi_connected_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_wifi_connected_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_wifi_connected_features/phone_wifi_connected_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_WIFI_CONNECTED"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_wifi_connected.csv", pid=config["PIDS"])) if config["HEARTRATE"]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["HEARTRATE"]["DB_TABLE"])) @@ -151,31 +139,27 @@ if config["SLEEP"]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_{fitbit_data_type}_with_datetime.csv", pid=config["PIDS"], fitbit_data_type=["intraday", "summary"])) files_to_compute.extend(expand("data/processed/{pid}/fitbit_sleep_{day_segment}.csv", pid = config["PIDS"], day_segment = config["SLEEP"]["DAY_SEGMENTS"])) -for provider in config["CONVERSATION"]["PROVIDERS"].keys(): - if config["CONVERSATION"]["PROVIDERS"][provider]["COMPUTE"]: - pids_android = list(filter(lambda pid: infer_participant_platform("data/external/" + pid) == "android", config["PIDS"])) - pids_ios = list(filter(lambda pid: infer_participant_platform("data/external/" + pid) == "ios", config["PIDS"])) - for pids,table in zip([pids_android, pids_ios], [config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["CONVERSATION"]["DB_TABLE"]["IOS"]]): - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=pids, sensor=table)) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=pids, sensor=table)) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=pids, sensor=table)) - files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["CONVERSATION"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="CONVERSATION".lower())) - files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="CONVERSATION".lower())) +for provider in config["PHONE_CONVERSATION"]["PROVIDERS"].keys(): + if config["PHONE_CONVERSATION"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_conversation_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_conversation_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_conversation_with_datetime_unified.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_conversation_features/phone_conversation_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_CONVERSATION"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_conversation.csv", pid=config["PIDS"])) -for provider in config["LOCATIONS"]["PROVIDERS"].keys(): - if config["LOCATIONS"]["PROVIDERS"][provider]["COMPUTE"]: - if config["LOCATIONS"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED": - if config["LOCATIONS"]["DB_TABLE"] in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"]: +for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys(): + if config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["COMPUTE"]: + if config["PHONE_LOCATIONS"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED": + if config["PHONE_LOCATIONS"]["DB_TABLE"] in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"]: files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"])) else: raise ValueError("Error: Add your locations table (and as many sensor tables as you have) to [PHONE_VALID_SENSED_BINS][DB_TABLES] in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data) which is used to resample fused location data (RESAMPLED_FUSED)") - - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["LOCATIONS"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["LOCATIONS"]["DB_TABLE"])) - files_to_compute.extend(expand("data/interim/{pid}/{sensor}_processed_{locations_to_use}.csv", pid=config["PIDS"], sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"])) - files_to_compute.extend(expand("data/interim/{pid}/{sensor}_processed_{locations_to_use}_with_datetime.csv", pid=config["PIDS"], sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"])) - files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["LOCATIONS"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="LOCATIONS".lower())) - files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="LOCATIONS".lower())) + + files_to_compute.extend(expand("data/raw/{pid}/phone_locations_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_locations_features/phone_locations_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_locations.csv", pid=config["PIDS"])) # visualization for data exploration if config["HEATMAP_FEATURES_CORRELATIONS"]["PLOT"]: diff --git a/config.yaml b/config.yaml index 1127e521..4183c644 100644 --- a/config.yaml +++ b/config.yaml @@ -32,9 +32,12 @@ READABLE_DATETIME: PHONE_VALID_SENSED_BINS: COMPUTE: False # This flag is automatically ignored (set to True) if you are extracting PHONE_VALID_SENSED_DAYS or screen or Barnett's location features BIN_SIZE: &bin_size 5 # (in minutes) - # Add as many sensor tables as you have, they all improve the computation of PHONE_VALID_SENSED_BINS and PHONE_VALID_SENSED_DAYS. - # If you are extracting screen or Barnett's location features, screen and locations tables are mandatory. - DB_TABLES: [] + # Add as many PHONE sensors as you have, they all improve the computation of PHONE_VALID_SENSED_BINS and PHONE_VALID_SENSED_DAYS. + # If you are extracting screen or Barnett/Doryab location features, PHONE_SCREEN and PHONE_LOCATIONS tables are mandatory. + # You can choose any of the keys shown below, just make sure its DB_TABLE exists in your database! + # PHONE_MESSAGES, PHONE_CALLS, PHONE_LOCATIONS, PHONE_BLUETOOTH, PHONE_ACTIVITY_RECOGNITION, PHONE_BATTERY, PHONE_SCREEN, PHONE_LIGHT, + # PHONE_ACCELEROMETER, PHONE_APPLICATIONS_FOREGROUND, PHONE_WIFI_VISIBLE, PHONE_WIFI_CONNECTED, PHONE_CONVERSATION + PHONE_SENSORS: [] PHONE_VALID_SENSED_DAYS: COMPUTE: False @@ -42,7 +45,7 @@ PHONE_VALID_SENSED_DAYS: MIN_VALID_BINS_PER_HOUR: &min_valid_bins_per_hour [6] # (out of 60min/BIN_SIZE bins) # Communication SMS features config, TYPES and FEATURES keys need to match -MESSAGES: +PHONE_MESSAGES: DB_TABLE: messages PROVIDERS: RAPIDS: @@ -52,10 +55,10 @@ MESSAGES: received: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact] sent: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact] SRC_LANGUAGE: "r" - SRC_FOLDER: "rapids" # inside src/features/messages + SRC_FOLDER: "rapids" # inside src/features/phone_messages # Communication call features config, TYPES and FEATURES keys need to match -CALLS: +PHONE_CALLS: DB_TABLE: calls PROVIDERS: RAPIDS: @@ -66,20 +69,13 @@ CALLS: incoming: [count, distinctcontacts, meanduration, sumduration, minduration, maxduration, stdduration, modeduration, entropyduration, timefirstcall, timelastcall, countmostfrequentcontact] outgoing: [count, distinctcontacts, meanduration, sumduration, minduration, maxduration, stdduration, modeduration, entropyduration, timefirstcall, timelastcall, countmostfrequentcontact] SRC_LANGUAGE: "r" - SRC_FOLDER: "rapids" # inside src/features/calls + SRC_FOLDER: "rapids" # inside src/features/phone_calls -APPLICATION_GENRES: - CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scrapped from the Play Store) - CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv" - UPDATE_CATALOGUE_FILE: false # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE - SCRAPE_MISSING_GENRES: false # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway - -LOCATIONS: +PHONE_LOCATIONS: DB_TABLE: locations LOCATIONS_TO_USE: FUSED_RESAMPLED # ALL, GPS OR FUSED_RESAMPLED FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row - TIMEZONE: *timezone PROVIDERS: DORYAB: COMPUTE: False @@ -90,7 +86,7 @@ LOCATIONS: MAXIMUM_GAP_ALLOWED: 300 MINUTES_DATA_USED: False SAMPLING_FREQUENCY: 0 - SRC_FOLDER: "doryab" # inside src/features/locations + SRC_FOLDER: "doryab" # inside src/features/phone_locations SRC_LANGUAGE: "python" BARNETT: @@ -99,20 +95,20 @@ LOCATIONS: ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius TIMEZONE: *timezone MINUTES_DATA_USED: False # Use this for quality control purposes, how many minutes of data (location coordinates gruped by minute) were used to compute features - SRC_FOLDER: "barnett" # inside src/features/locations + SRC_FOLDER: "barnett" # inside src/features/phone_locations SRC_LANGUAGE: "r" -BLUETOOTH: +PHONE_BLUETOOTH: DB_TABLE: bluetooth PROVIDERS: RAPIDS: COMPUTE: False FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"] - SRC_FOLDER: "rapids" # inside src/features/bluetooth + SRC_FOLDER: "rapids" # inside src/features/phone_bluetooth SRC_LANGUAGE: "r" -ACTIVITY_RECOGNITION: +PHONE_ACTIVITY_RECOGNITION: DB_TABLE: ANDROID: plugin_google_activity_recognition IOS: plugin_ios_activity_recognition @@ -124,19 +120,19 @@ ACTIVITY_RECOGNITION: STATIONARY: ["still", "tilting"] MOBILE: ["on_foot", "walking", "running", "on_bicycle"] VEHICLE: ["in_vehicle"] - SRC_FOLDER: "rapids" # inside src/features/activity_recognition + SRC_FOLDER: "rapids" # inside src/features/phone_activity_recognition SRC_LANGUAGE: "python" -BATTERY: +PHONE_BATTERY: DB_TABLE: battery PROVIDERS: RAPIDS: COMPUTE: False FEATURES: ["countdischarge", "sumdurationdischarge", "countcharge", "sumdurationcharge", "avgconsumptionrate", "maxconsumptionrate"] - SRC_FOLDER: "rapids" # inside src/features/battery + SRC_FOLDER: "rapids" # inside src/features/phone_battery SRC_LANGUAGE: "python" -SCREEN: +PHONE_SCREEN: DB_TABLE: screen PROVIDERS: RAPIDS: @@ -146,25 +142,25 @@ SCREEN: IGNORE_EPISODES_LONGER_THAN: 0 # in minutes, set to 0 to disable FEATURES: ["countepisode", "sumduration", "maxduration", "minduration", "avgduration", "stdduration", "firstuseafter"] # "episodepersensedminutes" needs to be added later EPISODE_TYPES: ["unlock"] - SRC_FOLDER: "rapids" # inside src/features/screen + SRC_FOLDER: "rapids" # inside src/features/phone_screen SRC_LANGUAGE: "python" -LIGHT: +PHONE_LIGHT: DB_TABLE: light PROVIDERS: RAPIDS: COMPUTE: False FEATURES: ["count", "maxlux", "minlux", "avglux", "medianlux", "stdlux"] - SRC_FOLDER: "rapids" # inside src/features/light + SRC_FOLDER: "rapids" # inside src/features/phone_light SRC_LANGUAGE: "python" -ACCELEROMETER: +PHONE_ACCELEROMETER: DB_TABLE: accelerometer PROVIDERS: RAPIDS: COMPUTE: False FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"] - SRC_FOLDER: "rapids" # inside src/features/accelerometer + SRC_FOLDER: "rapids" # inside src/features/phone_accelerometer SRC_LANGUAGE: "python" PANDA: @@ -173,11 +169,16 @@ ACCELEROMETER: FEATURES: exertional_activity_episode: ["sumduration", "maxduration", "minduration", "avgduration", "medianduration", "stdduration"] nonexertional_activity_episode: ["sumduration", "maxduration", "minduration", "avgduration", "medianduration", "stdduration"] - SRC_FOLDER: "panda" # inside src/features/accelerometer + SRC_FOLDER: "panda" # inside src/features/phone_accelerometer SRC_LANGUAGE: "python" -APPLICATIONS_FOREGROUND: +PHONE_APPLICATIONS_FOREGROUND: DB_TABLE: applications_foreground + APPLICATION_CATEGORIES: + CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scrapped from the Play Store) + CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv" + UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE + SCRAPE_MISSING_CATEGORIES: False # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway PROVIDERS: RAPIDS: COMPUTE: False @@ -189,9 +190,45 @@ APPLICATIONS_FOREGROUND: EXCLUDED_CATEGORIES: [] EXCLUDED_APPS: ["com.fitbit.FitbitMobile", "com.aware.plugin.upmc.cancer"] FEATURES: ["count", "timeoffirstuse", "timeoflastuse", "frequencyentropy"] - SRC_FOLDER: "rapids" # inside src/features/applications_foreground + SRC_FOLDER: "rapids" # inside src/features/phone_applications_foreground SRC_LANGUAGE: "python" +PHONE_WIFI_VISIBLE: + DB_TABLE: "wifi" + PROVIDERS: + RAPIDS: + COMPUTE: False + FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"] + SRC_FOLDER: "rapids" # inside src/features/phone_wifi_visible + SRC_LANGUAGE: "r" + +PHONE_WIFI_CONNECTED: + DB_TABLE: "sensor_wifi" + PROVIDERS: + RAPIDS: + COMPUTE: False + FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"] + SRC_FOLDER: "rapids" # inside src/features/phone_wifi_connected + SRC_LANGUAGE: "r" + +PHONE_CONVERSATION: + DB_TABLE: + ANDROID: plugin_studentlife_audio_android + IOS: plugin_studentlife_audio + PROVIDERS: + RAPIDS: + COMPUTE: False + FEATURES: ["minutessilence", "minutesnoise", "minutesvoice", "minutesunknown","sumconversationduration","avgconversationduration", + "sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","sumenergy", + "avgenergy","sdenergy","minenergy","maxenergy","silencesensedfraction","noisesensedfraction", + "voicesensedfraction","unknownsensedfraction","silenceexpectedfraction","noiseexpectedfraction","voiceexpectedfraction", + "unknownexpectedfraction","countconversation"] + RECORDING_MINUTES: 1 + PAUSED_MINUTES : 3 + SRC_FOLDER: "rapids" # inside src/features/phone_conversation + SRC_LANGUAGE: "python" + + HEARTRATE: COMPUTE: False DB_TABLE: fitbit_data @@ -223,34 +260,6 @@ SLEEP: SLEEP_TYPES: ["main", "nap", "all"] SUMMARY_FEATURES: ["sumdurationafterwakeup", "sumdurationasleep", "sumdurationawake", "sumdurationtofallasleep", "sumdurationinbed", "avgefficiency", "countepisode"] -WIFI: - DB_TABLE: - VISIBLE_ACCESS_POINTS: "wifi" # if you only have a CONNECTED_ACCESS_POINTS table, set this value to "" - CONNECTED_ACCESS_POINTS: "sensor_wifi" # if you only have a VISIBLE_ACCESS_POINTS table, set this value to "" - PROVIDERS: - RAPIDS: - COMPUTE: False - FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"] - SRC_FOLDER: "rapids" # inside src/features/bluetooth - SRC_LANGUAGE: "r" - -CONVERSATION: - DB_TABLE: - ANDROID: plugin_studentlife_audio_android - IOS: plugin_studentlife_audio - PROVIDERS: - RAPIDS: - COMPUTE: False - FEATURES: ["minutessilence", "minutesnoise", "minutesvoice", "minutesunknown","sumconversationduration","avgconversationduration", - "sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","sumenergy", - "avgenergy","sdenergy","minenergy","maxenergy","silencesensedfraction","noisesensedfraction", - "voicesensedfraction","unknownsensedfraction","silenceexpectedfraction","noiseexpectedfraction","voiceexpectedfraction", - "unknownexpectedfraction","countconversation"] - RECORDING_MINUTES: 1 - PAUSED_MINUTES : 3 - SRC_FOLDER: "rapids" # inside src/features/conversation - SRC_LANGUAGE: "python" - ### Visualizations ################################################################ HEATMAP_FEATURES_CORRELATIONS: PLOT: False diff --git a/rules/common.smk b/rules/common.smk index 4e247209..72852b16 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -14,69 +14,20 @@ def infer_participant_platform(participant_file): return platform -# Preprocessing.smk #################################################################################################### - -def optional_phone_sensed_bins_input(wildcards): - platform = infer_participant_platform("data/external/"+wildcards.pid) - - if platform == "android": - tables_platform = [table for table in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["IOS"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]]] # for android, discard any ios tables that may exist - elif platform == "ios": - tables_platform = [table for table in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"]]] # for ios, discard any android tables that may exist - - return expand("data/raw/{{pid}}/{table}_with_datetime.csv", table = tables_platform) - -def optional_phone_sensed_timestamps_input(wildcards): - platform = infer_participant_platform("data/external/"+wildcards.pid) - - if platform == "android": - tables_platform = [table for table in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["IOS"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]]] # for android, discard any ios tables that may exist - elif platform == "ios": - tables_platform = [table for table in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"]]] # for ios, discard any android tables that may exist - - return expand("data/raw/{{pid}}/{table}_raw.csv", table = tables_platform) - # Features.smk ######################################################################################################### def find_features_files(wildcards): feature_files = [] for provider_key, provider in config[(wildcards.sensor_key).upper()]["PROVIDERS"].items(): if provider["COMPUTE"]: - feature_files.extend(expand("data/interim/{{pid}}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", sensor_key=(wildcards.sensor_key).lower(), language=provider["SRC_LANGUAGE"].lower(), provider_key=provider_key)) + feature_files.extend(expand("data/interim/{{pid}}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", sensor_key=wildcards.sensor_key.lower(), language=provider["SRC_LANGUAGE"].lower(), provider_key=provider_key.lower())) return(feature_files) -def optional_ar_input(wildcards): - platform = infer_participant_platform("data/external/"+wildcards.pid) - - if platform == "android": - return expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"]) - elif platform == "ios": - return expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]) - -def optional_conversation_input(wildcards): - platform = infer_participant_platform("data/external/"+wildcards.pid) - - if platform == "android": - return expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CONVERSATION"]["DB_TABLE"]["ANDROID"])[0] - elif platform == "ios": - return expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CONVERSATION"]["DB_TABLE"]["IOS"])[0] - def optional_steps_sleep_input(wildcards): if config["STEP"]["EXCLUDE_SLEEP"]["EXCLUDE"] == True and config["STEP"]["EXCLUDE_SLEEP"]["TYPE"] == "FITBIT_BASED": return "data/raw/{pid}/fitbit_sleep_summary_with_datetime.csv" else: return [] -def optional_wifi_input(wildcards): - if len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) > 0 and len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) == 0: - return {"visible_access_points": expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"])} - elif len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) == 0 and len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) > 0: - return {"connected_access_points": expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"])} - elif len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) > 0 and len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) > 0: - return {"visible_access_points": expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]), "connected_access_points": expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"])} - else: - raise ValueError("If you are computing WIFI features you need to provide either VISIBLE_ACCESS_POINTS, CONNECTED_ACCESS_POINTS or both") - - # Models.smk ########################################################################################################### def input_merge_features_of_single_participant(wildcards): diff --git a/rules/features.smk b/rules/features.smk index 7945d62d..672c936f 100644 --- a/rules/features.smk +++ b/rules/features.smk @@ -28,341 +28,211 @@ rule resample_episodes_with_datetime: script: "../src/data/readable_datetime.R" -rule accelerometer_r_features: +rule phone_accelerometer_python_features: input: - sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["ACCELEROMETER"]["DB_TABLE"])[0], + sensor_data = "data/raw/{pid}/phone_accelerometer_with_datetime.csv", day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" params: - provider = lambda wildcards: config["ACCELEROMETER"]["PROVIDERS"][wildcards.provider_key], + provider = lambda wildcards: config["PHONE_ACCELEROMETER"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", - sensor_key = "accelerometer" + sensor_key = "phone_accelerometer" output: - "data/interim/{pid}/accelerometer_features/accelerometer_r_{provider_key}.csv" - script: - "../src/features/entry.R" - -rule accelerometer_python_features: - input: - sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["ACCELEROMETER"]["DB_TABLE"])[0], - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" - params: - provider = lambda wildcards: config["ACCELEROMETER"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}", - sensor_key = "accelerometer" - output: - "data/interim/{pid}/accelerometer_features/accelerometer_python_{provider_key}.csv" + "data/interim/{pid}/phone_accelerometer_features/phone_accelerometer_python_{provider_key}.csv" script: "../src/features/entry.py" rule activity_recognition_episodes: input: - optional_ar_input + sensor_data = "data/raw/{pid}/phone_activity_recognition_with_datetime_unified.csv" output: - "data/interim/{pid}/activity_recognition_episodes.csv" + "data/interim/{pid}/phone_activity_recognition_episodes.csv" script: - "../src/features/activity_recognition/episodes/activity_recognition_episodes.R" + "../src/features/phone_activity_recognition/episodes/activity_recognition_episodes.R" -rule activity_recognition_r_features: +rule phone_activity_recognition_python_features: input: - sensor_episodes = "data/interim/{pid}/activity_recognition_episodes_resampled_with_datetime.csv", + sensor_episodes = "data/interim/{pid}/phone_activity_recognition_episodes_resampled_with_datetime.csv", day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" params: - provider = lambda wildcards: config["ACTIVITY_RECOGNITION"]["PROVIDERS"][wildcards.provider_key], + provider = lambda wildcards: config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", - sensor_key = "activity_recognition" + sensor_key = "phone_activity_recognition" output: - "data/interim/{pid}/activity_recognition_features/activity_recognition_r_{provider_key}.csv" - script: - "../src/features/entry.R" - -rule activity_recognition_python_features: - input: - sensor_episodes = "data/interim/{pid}/activity_recognition_episodes_resampled_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" - params: - provider = lambda wildcards: config["ACTIVITY_RECOGNITION"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}", - sensor_key = "activity_recognition" - output: - "data/interim/{pid}/activity_recognition_features/activity_recognition_python_{provider_key}.csv" + "data/interim/{pid}/phone_activity_recognition_features/phone_activity_recognition_python_{provider_key}.csv" script: "../src/features/entry.py" -rule applications_foreground_r_features: +rule phone_applications_foreground_python_features: input: - sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_with_genre.csv", sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])[0], + sensor_data = "data/raw/{pid}/phone_applications_foreground_with_datetime_with_categories.csv", day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" params: - provider = lambda wildcards: config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][wildcards.provider_key], + provider = lambda wildcards: config["PHONE_APPLICATIONS_FOREGROUND"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", - sensor_key = "applications_foreground" + sensor_key = "phone_applications_foreground" output: - "data/interim/{pid}/applications_foreground_features/applications_foreground_r_{provider_key}.csv" - script: - "../src/features/entry.R" - -rule applications_foreground_python_features: - input: - sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_with_genre.csv", sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])[0], - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" - params: - provider = lambda wildcards: config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}", - sensor_key = "applications_foreground" - output: - "data/interim/{pid}/applications_foreground_features/applications_foreground_python_{provider_key}.csv" + "data/interim/{pid}/phone_applications_foreground_features/phone_applications_foreground_python_{provider_key}.csv" script: "../src/features/entry.py" rule battery_episodes: input: - expand("data/raw/{{pid}}/{sensor}_raw.csv", sensor=config["BATTERY"]["DB_TABLE"]) + "data/raw/{pid}/phone_battery_raw.csv" output: - "data/interim/{pid}/battery_episodes.csv" + "data/interim/{pid}/phone_battery_episodes.csv" script: - "../src/features/battery/episodes/battery_episodes.R" + "../src/features/phone_battery/episodes/battery_episodes.R" -rule battery_r_features: +rule phone_battery_python_features: input: - sensor_episodes = "data/interim/{pid}/battery_episodes_resampled_with_datetime.csv", + sensor_episodes = "data/interim/{pid}/phone_battery_episodes_resampled_with_datetime.csv", day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" params: - provider = lambda wildcards: config["BATTERY"]["PROVIDERS"][wildcards.provider_key], + provider = lambda wildcards: config["PHONE_BATTERY"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", - sensor_key = "battery" + sensor_key = "phone_battery" output: - "data/interim/{pid}/battery_features/battery_r_{provider_key}.csv" - script: - "../src/features/entry.R" - -rule battery_python_features: - input: - sensor_episodes = "data/interim/{pid}/battery_episodes_resampled_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" - params: - provider = lambda wildcards: config["BATTERY"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}", - sensor_key = "battery" - output: - "data/interim/{pid}/battery_features/battery_python_{provider_key}.csv" + "data/interim/{pid}/phone_battery_features/phone_battery_python_{provider_key}.csv" script: "../src/features/entry.py" -rule bluetooth_r_features: +rule phone_bluetooth_r_features: input: - sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["BLUETOOTH"]["DB_TABLE"])[0], + sensor_data = "data/raw/{pid}/phone_bluetooth_with_datetime.csv", day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" params: - provider = lambda wildcards: config["BLUETOOTH"]["PROVIDERS"][wildcards.provider_key], + provider = lambda wildcards: config["PHONE_BLUETOOTH"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", - sensor_key = "bluetooth" + sensor_key = "phone_bluetooth" output: - "data/interim/{pid}/bluetooth_features/bluetooth_r_{provider_key}.csv" + "data/interim/{pid}/phone_bluetooth_features/phone_bluetooth_r_{provider_key}.csv" script: "../src/features/entry.R" -rule bluetooth_python_features: - input: - sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["BLUETOOTH"]["DB_TABLE"])[0], - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" - params: - provider = lambda wildcards: config["BLUETOOTH"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}", - sensor_key = "bluetooth" - output: - "data/interim/{pid}/bluetooth_features/bluetooth_python_{provider_key}.csv" - script: - "../src/features/entry.py" - rule calls_r_features: input: - sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CALLS"]["DB_TABLE"])[0], + sensor_data = "data/raw/{pid}/phone_calls_with_datetime_unified.csv", day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" params: - provider = lambda wildcards: config["CALLS"]["PROVIDERS"][wildcards.provider_key], + provider = lambda wildcards: config["PHONE_CALLS"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", - sensor_key = "calls" + sensor_key = "phone_calls" output: - "data/interim/{pid}/calls_features/calls_r_{provider_key}.csv" - script: - "../src/features/entry.R" - -rule calls_python_features: - input: - sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CALLS"]["DB_TABLE"])[0], - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" - params: - provider = lambda wildcards: config["CALLS"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}", - sensor_key = "calls" - output: - "data/interim/{pid}/calls_features/calls_python_{provider_key}.csv" - script: - "../src/features/entry.py" - -rule conversation_r_features: - input: - sensor_data = optional_conversation_input, - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" - params: - provider = lambda wildcards: config["CONVERSATION"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}", - sensor_key = "conversation" - output: - "data/interim/{pid}/conversation_features/conversation_r_{provider_key}.csv" + "data/interim/{pid}/phone_calls_features/phone_calls_r_{provider_key}.csv" script: "../src/features/entry.R" rule conversation_python_features: input: - sensor_data = optional_conversation_input, + sensor_data = "data/raw/{pid}/phone_conversation_with_datetime_unified.csv", day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" params: - provider = lambda wildcards: config["CONVERSATION"]["PROVIDERS"][wildcards.provider_key], + provider = lambda wildcards: config["PHONE_CONVERSATION"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", - sensor_key = "conversation" + sensor_key = "phone_conversation" output: - "data/interim/{pid}/conversation_features/conversation_python_{provider_key}.csv" + "data/interim/{pid}/phone_conversation_features/phone_conversation_python_{provider_key}.csv" script: "../src/features/entry.py" -rule light_r_features: +rule phone_light_python_features: input: - sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["LIGHT"]["DB_TABLE"])[0], + sensor_data = "data/raw/{pid}/phone_light_with_datetime.csv", day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" params: - provider = lambda wildcards: config["LIGHT"]["PROVIDERS"][wildcards.provider_key], + provider = lambda wildcards: config["PHONE_LIGHT"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", - sensor_key = "light" + sensor_key = "phone_light" output: - "data/interim/{pid}/light_features/light_r_{provider_key}.csv" + "data/interim/{pid}/phone_light_features/phone_light_python_{provider_key}.csv" + script: + "../src/features/entry.py" + +rule phone_locations_r_features: + input: + sensor_data = "data/interim/{pid}/phone_locations_processed_with_datetime.csv", + day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + params: + provider = lambda wildcards: config["PHONE_LOCATIONS"]["PROVIDERS"][wildcards.provider_key.upper()], + provider_key = "{provider_key}", + sensor_key = "phone_locations" + output: + "data/interim/{pid}/phone_locations_features/phone_locations_r_{provider_key}.csv" script: "../src/features/entry.R" -rule light_python_features: +rule phone_locations_python_features: input: - sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["LIGHT"]["DB_TABLE"])[0], + sensor_data = "data/interim/{pid}/phone_locations_processed_with_datetime.csv", day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" params: - provider = lambda wildcards: config["LIGHT"]["PROVIDERS"][wildcards.provider_key], + provider = lambda wildcards: config["PHONE_LOCATIONS"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", - sensor_key = "light" + sensor_key = "phone_locations" output: - "data/interim/{pid}/light_features/light_python_{provider_key}.csv" + "data/interim/{pid}/phone_locations_features/phone_locations_python_{provider_key}.csv" script: "../src/features/entry.py" -rule locations_r_features: +rule phone_messages_r_features: input: - sensor_data = expand("data/interim/{{pid}}/{sensor}_processed_{locations_to_use}_with_datetime.csv", sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"])[0], + sensor_data = "data/raw/{pid}/phone_messages_with_datetime.csv", day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" params: - provider = lambda wildcards: config["LOCATIONS"]["PROVIDERS"][wildcards.provider_key], + provider = lambda wildcards: config["PHONE_MESSAGES"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", - sensor_key = "locations" + sensor_key = "phone_messages" output: - "data/interim/{pid}/locations_features/locations_r_{provider_key}.csv" + "data/interim/{pid}/phone_messages_features/phone_messages_r_{provider_key}.csv" script: "../src/features/entry.R" -rule locations_python_features: - input: - sensor_data = expand("data/interim/{{pid}}/{sensor}_processed_{locations_to_use}_with_datetime.csv", sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"])[0], - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" - params: - provider = lambda wildcards: config["LOCATIONS"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}", - sensor_key = "locations" - output: - "data/interim/{pid}/locations_features/locations_python_{provider_key}.csv" - script: - "../src/features/entry.py" - -rule messages_r_features: - input: - sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["MESSAGES"]["DB_TABLE"])[0], - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" - params: - provider = lambda wildcards: config["MESSAGES"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}", - sensor_key = "messages" - output: - "data/interim/{pid}/messages_features/messages_r_{provider_key}.csv" - script: - "../src/features/entry.R" - -rule messages_python_features: - input: - sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["MESSAGES"]["DB_TABLE"])[0], - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" - params: - provider = lambda wildcards: config["MESSAGES"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}", - sensor_key = "messages" - output: - "data/interim/{pid}/messages_features/messages_python_{provider_key}.csv" - script: - "../src/features/entry.py" - rule screen_episodes: input: - screen = expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["SCREEN"]["DB_TABLE"]) + screen = "data/raw/{pid}/phone_screen_with_datetime_unified.csv" output: - "data/interim/{pid}/screen_episodes.csv" + "data/interim/{pid}/phone_screen_episodes.csv" script: - "../src/features/screen/episodes/screen_episodes.R" + "../src/features/phone_screen/episodes/screen_episodes.R" -rule screen_r_features: +rule phone_screen_python_features: input: - sensor_episodes = "data/interim/{pid}/screen_episodes_resampled_with_datetime.csv", + sensor_episodes = "data/interim/{pid}/phone_screen_episodes_resampled_with_datetime.csv", day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" params: - provider = lambda wildcards: config["SCREEN"]["PROVIDERS"][wildcards.provider_key], + provider = lambda wildcards: config["PHONE_SCREEN"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", - sensor_key = "screen" + sensor_key = "phone_screen" output: - "data/interim/{pid}/screen_features/screen_r_{provider_key}.csv" - script: - "../src/features/entry.R" - -rule screen_python_features: - input: - sensor_episodes = "data/interim/{pid}/screen_episodes_resampled_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" - params: - provider = lambda wildcards: config["SCREEN"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}", - sensor_key = "screen" - output: - "data/interim/{pid}/screen_features/screen_python_{provider_key}.csv" + "data/interim/{pid}/phone_screen_features/phone_screen_python_{provider_key}.csv" script: "../src/features/entry.py" -rule wifi_r_features: +rule phone_wifi_connected_r_features: input: - sensor_data = expand("data/raw/{{pid}}/{sensor_key}_with_datetime_visibleandconnected.csv", sensor_key="WIFI".lower())[0], + sensor_data = "data/raw/{pid}/phone_wifi_connected_with_datetime.csv", day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" params: - provider = lambda wildcards: config["WIFI"]["PROVIDERS"][wildcards.provider_key], + provider = lambda wildcards: config["PHONE_WIFI_CONNECTED"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", - sensor_key = "wifi" + sensor_key = "phone_wifi_connected" output: - "data/interim/{pid}/wifi_features/wifi_r_{provider_key}.csv" + "data/interim/{pid}/phone_wifi_connected_features/phone_wifi_connected_r_{provider_key}.csv" script: "../src/features/entry.R" -rule wifi_python_features: +rule phone_wifi_visible_r_features: input: - sensor_data = expand("data/raw/{{pid}}/{sensor_key}_with_datetime_visibleandconnected.csv", sensor_key="WIFI".lower())[0], + sensor_data = "data/raw/{pid}/phone_wifi_visible_with_datetime.csv", day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" params: - provider = lambda wildcards: config["WIFI"]["PROVIDERS"][wildcards.provider_key], + provider = lambda wildcards: config["PHONE_WIFI_VISIBLE"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", - sensor_key = "wifi" + sensor_key = "phone_wifi_visible" output: - "data/interim/{pid}/wifi_features/wifi_python_{provider_key}.csv" + "data/interim/{pid}/phone_wifi_visible_features/phone_wifi_visible_r_{provider_key}.csv" script: - "../src/features/entry.py" + "../src/features/entry.R" rule fitbit_heartrate_features: input: diff --git a/rules/preprocessing.smk b/rules/preprocessing.smk index 8e4581c0..4f332d4e 100644 --- a/rules/preprocessing.smk +++ b/rules/preprocessing.smk @@ -29,10 +29,10 @@ rule download_dataset: "data/external/{pid}" params: group = config["DOWNLOAD_DATASET"]["GROUP"], - table = "{sensor}", + sensor = "{sensor}", + table = lambda wildcards: config[str(wildcards.sensor).upper()]["DB_TABLE"], timezone = config["TIMEZONE"], - aware_multiplatform_tables = config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"] + "," + config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"] + "," + config["CONVERSATION"]["DB_TABLE"]["ANDROID"] + "," + config["CONVERSATION"]["DB_TABLE"]["IOS"], - unifiable_sensors = {"calls": config["CALLS"]["DB_TABLE"], "battery": config["BATTERY"]["DB_TABLE"], "screen": config["SCREEN"]["DB_TABLE"], "ios_activity_recognition": config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"], "ios_conversation": config["CONVERSATION"]["DB_TABLE"]["IOS"]} + aware_multiplatform_tables = config["PHONE_ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"] + "," + config["PHONE_ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"] + "," + config["PHONE_CONVERSATION"]["DB_TABLE"]["ANDROID"] + "," + config["PHONE_CONVERSATION"]["DB_TABLE"]["IOS"], output: "data/raw/{pid}/{sensor}_raw.csv" script: @@ -50,35 +50,23 @@ rule compute_day_segments: script: "../src/data/compute_day_segments.py" -PHONE_SENSORS = [] -PHONE_SENSORS.extend([config["MESSAGES"]["DB_TABLE"], config["CALLS"]["DB_TABLE"], config["LOCATIONS"]["DB_TABLE"], config["BLUETOOTH"]["DB_TABLE"], config["BATTERY"]["DB_TABLE"], config["SCREEN"]["DB_TABLE"], config["LIGHT"]["DB_TABLE"], config["ACCELEROMETER"]["DB_TABLE"], config["APPLICATIONS_FOREGROUND"]["DB_TABLE"], config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["CONVERSATION"]["DB_TABLE"]["IOS"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]]) -PHONE_SENSORS.extend(config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"]) - -if len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) > 0: - PHONE_SENSORS.append(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) -if len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) > 0: - PHONE_SENSORS.append(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) - - -rule readable_datetime: +rule phone_readable_datetime: input: - sensor_input = "data/raw/{pid}/{sensor}_raw.csv", + sensor_input = "data/raw/{pid}/phone_{sensor}_raw.csv", day_segments = "data/interim/day_segments/{pid}_day_segments.csv" params: timezones = None, fixed_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"], day_segments_type = config["DAY_SEGMENTS"]["TYPE"], include_past_periodic_segments = config["DAY_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"] - wildcard_constraints: - sensor = '(' + '|'.join([re.escape(x) for x in PHONE_SENSORS]) + ')' # only process smartphone sensors, not fitbit output: - "data/raw/{pid}/{sensor}_with_datetime.csv" + "data/raw/{pid}/phone_{sensor}_with_datetime.csv" script: "../src/data/readable_datetime.R" rule phone_sensed_bins: input: - all_sensors = optional_phone_sensed_bins_input + all_sensors = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor = map(str.lower, config["PHONE_VALID_SENSED_BINS"]["PHONE_SENSORS"])) params: bin_size = config["PHONE_VALID_SENSED_BINS"]["BIN_SIZE"] output: @@ -88,7 +76,7 @@ rule phone_sensed_bins: rule phone_sensed_timestamps: input: - all_sensors = optional_phone_sensed_timestamps_input + all_sensors = expand("data/raw/{{pid}}/{sensor}_raw.csv", sensor = map(str.lower, config["PHONE_VALID_SENSED_BINS"]["PHONE_SENSORS"])) output: "data/interim/{pid}/phone_sensed_timestamps.csv" script: @@ -112,55 +100,50 @@ rule unify_ios_android: participant_info = "data/external/{pid}" params: sensor = "{sensor}", - unifiable_sensors = {"calls": config["CALLS"]["DB_TABLE"], "battery": config["BATTERY"]["DB_TABLE"], "screen": config["SCREEN"]["DB_TABLE"], "ios_activity_recognition": config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"], "ios_conversation": config["CONVERSATION"]["DB_TABLE"]["IOS"]} output: "data/raw/{pid}/{sensor}_with_datetime_unified.csv" script: "../src/data/unify_ios_android.R" -rule process_location_types: +rule process_phone_location_types: input: - locations = "data/raw/{pid}/{sensor}_raw.csv", + locations = "data/raw/{pid}/phone_locations_raw.csv", phone_sensed_timestamps = "data/interim/{pid}/phone_sensed_timestamps.csv", params: - consecutive_threshold = config["LOCATIONS"]["FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD"], - time_since_valid_location = config["LOCATIONS"]["FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION"], - locations_to_use = "{locations_to_use}" - wildcard_constraints: - locations_to_use = '(ALL|GPS|FUSED_RESAMPLED)' + consecutive_threshold = config["PHONE_LOCATIONS"]["FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD"], + time_since_valid_location = config["PHONE_LOCATIONS"]["FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION"], + locations_to_use = config["PHONE_LOCATIONS"]["LOCATIONS_TO_USE"] output: - "data/interim/{pid}/{sensor}_processed_{locations_to_use}.csv" + "data/interim/{pid}/phone_locations_processed.csv" script: "../src/data/process_location_types.R" rule readable_datetime_location_processed: input: - sensor_input = expand("data/interim/{{pid}}/{sensor}_processed_{locations_to_use}.csv", sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"]), + sensor_input = "data/interim/{pid}/phone_locations_processed.csv", day_segments = "data/interim/day_segments/{pid}_day_segments.csv" params: timezones = None, fixed_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"], day_segments_type = config["DAY_SEGMENTS"]["TYPE"], include_past_periodic_segments = config["DAY_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"] - wildcard_constraints: - locations_to_use = '(ALL|GPS|FUSED_RESAMPLED)' output: - expand("data/interim/{{pid}}/{sensor}_processed_{locations_to_use}_with_datetime.csv", sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"]) + "data/interim/{pid}/phone_locations_processed_with_datetime.csv" script: "../src/data/readable_datetime.R" -rule application_genres: +rule phone_application_categories: input: - "data/raw/{pid}/{sensor}_with_datetime.csv" + "data/raw/{pid}/phone_applications_foreground_with_datetime.csv" params: - catalogue_source = config["APPLICATION_GENRES"]["CATALOGUE_SOURCE"], - catalogue_file = config["APPLICATION_GENRES"]["CATALOGUE_FILE"], - update_catalogue_file = config["APPLICATION_GENRES"]["UPDATE_CATALOGUE_FILE"], - scrape_missing_genres = config["APPLICATION_GENRES"]["SCRAPE_MISSING_GENRES"] + catalogue_source = config["PHONE_APPLICATIONS_FOREGROUND"]["APPLICATION_CATEGORIES"]["CATALOGUE_SOURCE"], + catalogue_file = config["PHONE_APPLICATIONS_FOREGROUND"]["APPLICATION_CATEGORIES"]["CATALOGUE_FILE"], + update_catalogue_file = config["PHONE_APPLICATIONS_FOREGROUND"]["APPLICATION_CATEGORIES"]["UPDATE_CATALOGUE_FILE"], + scrape_missing_genres = config["PHONE_APPLICATIONS_FOREGROUND"]["APPLICATION_CATEGORIES"]["SCRAPE_MISSING_CATEGORIES"] output: - "data/raw/{pid}/{sensor}_with_datetime_with_genre.csv" + "data/raw/{pid}/phone_applications_foreground_with_datetime_with_categories.csv" script: - "../src/data/application_genres.R" + "../src/data/application_categories.R" rule fitbit_heartrate_with_datetime: input: @@ -196,11 +179,3 @@ rule fitbit_sleep_with_datetime: intraday_data = "data/raw/{pid}/fitbit_sleep_intraday_with_datetime.csv" script: "../src/data/fitbit_readable_datetime.py" - -rule join_wifi_tables: - input: - unpack(optional_wifi_input) - output: - "data/raw/{pid}/wifi_with_datetime_visibleandconnected.csv" - script: - "../src/data/join_visible_and_connected_wifi.R" \ No newline at end of file diff --git a/src/data/application_genres.R b/src/data/application_categories.R similarity index 100% rename from src/data/application_genres.R rename to src/data/application_categories.R diff --git a/src/data/assign_to_day_segment.R b/src/data/assign_to_day_segment.R index 02a046ba..d58cbdfa 100644 --- a/src/data/assign_to_day_segment.R +++ b/src/data/assign_to_day_segment.R @@ -2,166 +2,163 @@ library("tidyverse") library("lubridate") options(scipen=999) -find_segments_frequency <- function(local_date, local_time, local_timezone, segments){ - - assigned_segments <- segments[segments$segment_start<= local_time & segments$segment_end >= local_time, ] - assigned_segments["segment_start_ts"] = as.numeric(lubridate::as_datetime(stringi::stri_c(local_date,assigned_segments$segment_id_start_time), tz = local_timezone)) * 1000 - assigned_segments["segment_end_ts"] = as.numeric(lubridate::as_datetime(stringi::stri_c(local_date,assigned_segments$segment_id_end_time), tz = local_timezone)) * 1000 + 999 - - return(stringi::stri_c(stringi::stri_c("[", - assigned_segments[["label"]], "#", - local_date, " ", - assigned_segments[["segment_id_start_time"]], ",", - local_date, " ", - assigned_segments[["segment_id_end_time"]], ";", - assigned_segments[["segment_start_ts"]], ",", - assigned_segments[["segment_end_ts"]], - "]"), collapse = "|")) +day_type_delay <- function(day_type, include_past_periodic_segments){ + delay <- day_segments %>% mutate(length_duration = duration(length)) %>% filter(repeats_on == day_type) %>% arrange(-length_duration) %>% pull(length_duration) %>% first() + return(if_else(is.na(delay) | include_past_periodic_segments == FALSE, duration("0days"), delay)) } -find_segments_periodic <- function(timestamp, segments){ - # crossing and pivot_longer make segments a tibble, thus we need to extract [["segment_id"]] - return(stringi::stri_c(segments[[1]][segments[[1]]$segment_start_ts<= timestamp & segments[[1]]$segment_end_ts >= timestamp, "segment_id"][["segment_id"]], collapse = "|")) +get_segment_dates <- function(data, local_timezone, day_type, delay){ + dates <- data %>% + distinct(local_date) %>% + mutate(local_date_obj = date(lubridate::ymd(local_date, tz = local_timezone))) %>% + complete(local_date_obj = seq(date(min(local_date_obj) - delay), max(local_date_obj), by="days")) %>% + mutate(local_date = replace_na(as.character(date(local_date_obj)))) + + if(day_type == "every_day") + dates <- dates %>% mutate(every_day = 0) + else if (day_type == "wday") + dates <- dates %>% mutate(wday = wday(local_date_obj, week_start = 1)) + else if (day_type == "mday") + dates <- dates %>% mutate(mday = mday(local_date_obj)) + else if (day_type == "qday") + dates <- dates %>% mutate(qday = qday(local_date_obj)) + else if (day_type == "yday") + dates <- dates %>% mutate(yday = yday(local_date_obj)) + return(dates) } -find_segments_event <- function(timestamp, segments){ - # segments is a data.frame, we don't need to extract [["segment_id"]] like in find_segments_periodic - return(stringi::stri_c(segments[[1]][segments[[1]]$segment_start_ts<= timestamp & segments[[1]]$segment_end_ts >= timestamp, "segment_id"], collapse = "|")) +assign_rows_to_segments <- function(nested_data, nested_inferred_day_segments){ + nested_data <- nested_data %>% mutate(assigned_segments = "") + for(i in 1:nrow(nested_inferred_day_segments)) { + segment <- nested_inferred_day_segments[i,] + nested_data$assigned_segments <- ifelse(segment$segment_start_ts<= nested_data$timestamp & segment$segment_end_ts >= nested_data$timestamp, + stringi::stri_c(nested_data$assigned_segments, segment$segment_id, sep = "|"), nested_data$assigned_segments) + } + nested_data$assigned_segments <- substring(nested_data$assigned_segments, 2) + return(nested_data) +} + +assign_rows_to_segments_frequency <- function(nested_data, nested_timezone, day_segments){ + for(i in 1:nrow(day_segments)) { + segment <- day_segments[i,] + nested_data$assigned_segments <- ifelse(segment$segment_start_ts<= nested_data$local_time_obj & segment$segment_end_ts >= nested_data$local_time_obj, + # The segment_id is assambled on the fly because it depends on each row's local_date and timezone + stringi::stri_c("[", + segment[["label"]], "#", + nested_data$local_date, " ", + segment[["segment_id_start_time"]], ",", + nested_data$local_date, " ", + segment[["segment_id_end_time"]], ";", + as.numeric(lubridate::as_datetime(stringi::stri_c(nested_data$local_date, segment$segment_id_start_time), tz = nested_timezone)) * 1000, ",", + as.numeric(lubridate::as_datetime(stringi::stri_c(nested_data$local_date, segment$segment_id_end_time), tz = nested_timezone)) * 1000 + 999, + "]"), + nested_data$assigned_segments) + } + return(nested_data) } assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type, include_past_periodic_segments){ - + if(nrow(sensor_data) == 0) return(sensor_data %>% mutate(assigned_segments = NA)) - - if(day_segments_type == "FREQUENCY"){ #FREQUENCY + + if(day_segments_type == "FREQUENCY"){ day_segments <- day_segments %>% mutate(start_time = lubridate::hm(start_time), end_time = start_time + minutes(length) - seconds(1), segment_id_start_time = paste(str_pad(hour(start_time),2, pad="0"), str_pad(minute(start_time),2, pad="0"), str_pad(second(start_time),2, pad="0"),sep =":"), segment_id_end_time = paste(str_pad(hour(ymd("1970-01-01") + end_time),2, pad="0"), str_pad(minute(ymd("1970-01-01") + end_time),2, pad="0"), str_pad(second(ymd("1970-01-01") + end_time),2, pad="0"),sep =":"), # add ymd("1970-01-01") to get a real time instead of duration - segment_start = as.numeric(start_time), - segment_end = as.numeric(end_time)) - - sensor_data <- sensor_data %>% mutate(local_time_obj = as.numeric(lubridate::hms(local_time)), - assigned_segments = pmap_chr(list(local_date, local_time_obj, local_timezone), find_segments_frequency, day_segments)) %>% select(-local_time_obj) + segment_start_ts = as.numeric(start_time), + segment_end_ts = as.numeric(end_time)) - } else if (day_segments_type == "PERIODIC"){ #PERIODIC + sensor_data <- sensor_data %>% mutate(local_time_obj = as.numeric(lubridate::hms(local_time)), + assigned_segments = "") + + sensor_data <- sensor_data %>% + group_by(local_timezone) %>% + nest() %>% + mutate(data = map2(data, local_timezone, assign_rows_to_segments_frequency, day_segments)) %>% + unnest(cols = data) %>% + arrange(timestamp) %>% + select(-local_time_obj) + + return(sensor_data) + + + } else if (day_segments_type == "PERIODIC"){ # We need to take into account segment start dates that could include the first day of data day_segments <- day_segments %>% mutate(length_duration = duration(length)) - wday_delay <- day_segments %>% mutate(length_duration = duration(length)) %>% filter(repeats_on == "wday") %>% arrange(-length_duration) %>% pull(length_duration) %>% first() - wday_delay <- if_else(is.na(wday_delay) | include_past_periodic_segments == FALSE, duration("0days"), wday_delay) - - mday_delay <- day_segments %>% mutate(length_duration = duration(length)) %>% filter(repeats_on == "mday") %>% arrange(-length_duration) %>% pull(length_duration) %>% first() - mday_delay <- if_else(is.na(mday_delay) | include_past_periodic_segments == FALSE, duration("0days"), mday_delay) - - qday_delay <- day_segments %>% mutate(length_duration = duration(length)) %>% filter(repeats_on == "qday") %>% arrange(-length_duration) %>% pull(length_duration) %>% first() - qday_delay <- if_else(is.na(qday_delay) | include_past_periodic_segments == FALSE, duration("0days"), qday_delay) - - yday_delay <- day_segments %>% mutate(length_duration = duration(length)) %>% filter(repeats_on == "yday") %>% arrange(-length_duration) %>% pull(length_duration) %>% first() - yday_delay <- if_else(is.na(yday_delay) | include_past_periodic_segments == FALSE, duration("0days"), yday_delay) + every_day_delay <- duration("0days") + wday_delay <- day_type_delay("wday", include_past_periodic_segments) + mday_delay <- day_type_delay("mday", include_past_periodic_segments) + qday_delay <- day_type_delay("qday", include_past_periodic_segments) + yday_delay <- day_type_delay("yday", include_past_periodic_segments) sensor_data <- sensor_data %>% - # mutate(row_n = row_number()) %>% group_by(local_timezone) %>% nest() %>% # get existent days that we need to start segments from - mutate(every_date = map(data, ~.x %>% - distinct(local_date) %>% - mutate(local_date_obj = date(lubridate::ymd(local_date, tz = local_timezone))) %>% - complete(local_date_obj = seq(min(local_date_obj), max(local_date_obj), by="days")) %>% - mutate(local_date = replace_na(as.character(date(local_date_obj)))) %>% - mutate(every_day = 0)), - week_dates = map(data, ~.x %>% - distinct(local_date) %>% - mutate(local_date_obj = date(lubridate::ymd(local_date, tz = local_timezone))) %>% - complete(local_date_obj = seq(date(min(local_date_obj) - wday_delay), max(local_date_obj), by="days")) %>% - mutate(local_date = replace_na(as.character(date(local_date_obj)))) %>% - mutate(wday = wday(local_date_obj, week_start = 1)) ), - month_dates = map(data, ~.x %>% - distinct(local_date) %>% - mutate(local_date_obj = date(lubridate::ymd(local_date, tz = local_timezone))) %>% - complete(local_date_obj = seq(date(min(local_date_obj) - mday_delay), max(local_date_obj), by="days")) %>% - mutate(local_date = replace_na(as.character(date(local_date_obj)))) %>% - mutate(mday = mday(local_date_obj))), - quarter_dates = map(data, ~.x %>% - distinct(local_date) %>% - mutate(local_date_obj = date(lubridate::ymd(local_date, tz = local_timezone))) %>% - complete(local_date_obj = seq(date(min(local_date_obj) - qday_delay), max(local_date_obj), by="days")) %>% - mutate(local_date = replace_na(as.character(date(local_date_obj)))) %>% - mutate(qday = qday(local_date_obj)) ), - year_dates = map(data, ~.x %>% - distinct(local_date) %>% - mutate(local_date_obj = date(lubridate::ymd(local_date, tz = local_timezone))) %>% - complete(local_date_obj = seq(date(min(local_date_obj) - yday_delay), max(local_date_obj), by="days")) %>% - mutate(local_date = replace_na(as.character(date(local_date_obj)))) %>% - mutate(yday = yday(local_date_obj)) ), + mutate(every_date = map2(data, local_timezone, get_segment_dates, "every_day", every_day_delay), + week_dates = map2(data, local_timezone, get_segment_dates, "wday", wday_delay), + month_dates = map2(data, local_timezone, get_segment_dates, "mday", mday_delay), + quarter_dates = map2(data, local_timezone, get_segment_dates, "qday", qday_delay), + year_dates = map2(data, local_timezone, get_segment_dates, "yday", yday_delay), existent_dates = pmap(list(every_date, week_dates, month_dates, quarter_dates, year_dates), - function(every_date, week_dates, month_dates, quarter_dates, year_dates) reduce(list(every_date, week_dates,month_dates, quarter_dates, year_dates), .f=full_join)), - every_date = NULL, - week_dates = NULL, - month_dates = NULL, - quarter_dates = NULL, - year_dates = NULL, - # build the actual day segments taking into account the users requested leangth and repeat schedule + function(every_date, week_dates, month_dates, quarter_dates, year_dates) reduce(list(every_date, week_dates,month_dates, quarter_dates, year_dates), .f=full_join)), + # build the actual day segments taking into account the users requested length and repeat schedule inferred_day_segments = map(existent_dates, ~ crossing(day_segments, .x) %>% pivot_longer(cols = c(every_day,wday, mday, qday, yday), names_to = "day_type", values_to = "day_value") %>% filter(repeats_on == day_type & repeats_value == day_value) %>% - mutate(segment_id_start = lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM")), # The segment ids (label#start#end) are computed in UTC to avoid having different labels for instances of a segment that happen in different timezones + # The segment ids (segment_id_start and segment_id_end) are computed in UTC to avoid having different labels for instances of a segment that happen in different timezones + mutate(segment_id_start = lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM")), segment_id_end = segment_id_start + lubridate::duration(length), - segment_start_ts = as.numeric(lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM"), tz = local_timezone)) * 1000, # The actual segments are computed using timestamps taking into account the timezone + # The actual segments are computed using timestamps taking into account the timezone + segment_start_ts = as.numeric(lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM"), tz = local_timezone)) * 1000, segment_end_ts = segment_start_ts + as.numeric(lubridate::duration(length)) * 1000 + 999, segment_id = paste0("[", - paste0( - label,"#", - paste0(lubridate::date(segment_id_start), " ", - paste(str_pad(hour(segment_id_start),2, pad="0"), str_pad(minute(segment_id_start),2, pad="0"), str_pad(second(segment_id_start),2, pad="0"),sep =":"), ",", - lubridate::date(segment_id_end), " ", - paste(str_pad(hour(segment_id_end),2, pad="0"), str_pad(minute(segment_id_end),2, pad="0"), str_pad(second(segment_id_end),2, pad="0"),sep =":")),";", - paste0(segment_start_ts, ",", segment_end_ts) - ), + paste0(label,"#", + paste0(lubridate::date(segment_id_start), " ", + paste(str_pad(hour(segment_id_start),2, pad="0"), str_pad(minute(segment_id_start),2, pad="0"), str_pad(second(segment_id_start),2, pad="0"),sep =":"), ",", + lubridate::date(segment_id_end), " ", + paste(str_pad(hour(segment_id_end),2, pad="0"), str_pad(minute(segment_id_end),2, pad="0"), str_pad(second(segment_id_end),2, pad="0"),sep =":")),";", + paste0(segment_start_ts, ",", segment_end_ts)), "]")) %>% - select(segment_start_ts, segment_end_ts, segment_id) %>% - drop_na(segment_start_ts, segment_end_ts)), # drop day segments with an invalid start or end time (mostly due to daylight saving changes, e.g. 2020-03-08 02:00:00 EST does not exist, clock jumps from 1am to 3am) - data = map2(data, inferred_day_segments, ~ .x %>% mutate(row_date_time = as.numeric(lubridate::ymd_hms(local_date_time, tz = local_timezone)) * 1000, - assigned_segments = map_chr(row_date_time, ~find_segments_periodic(.x, inferred_day_segments)), - row_date_time = NULL)) + # drop day segments with an invalid start or end time (mostly due to daylight saving changes, e.g. 2020-03-08 02:00:00 EST does not exist, clock jumps from 01:59am to 03:00am) + drop_na(segment_start_ts, segment_end_ts)), + data = map2(data, inferred_day_segments, assign_rows_to_segments) ) %>% - select(-existent_dates, -inferred_day_segments) %>% + select(-existent_dates, -inferred_day_segments, -every_date, -week_dates, -month_dates, -quarter_dates, -year_dates) %>% unnest(cols = data) %>% arrange(timestamp) - - + } else if ( day_segments_type == "EVENT"){ sensor_data <- sensor_data %>% group_by(local_timezone) %>% nest() %>% - mutate(inferred_day_segments = map(local_timezone, ~ day_segments %>% mutate(shift = ifelse(shift == "0", "0seconds", shift), - segment_start_ts = event_timestamp + (as.integer(seconds(lubridate::duration(shift))) * ifelse(shift_direction >= 0, 1, -1) * 1000), - segment_end_ts = segment_start_ts + (as.integer(seconds(lubridate::duration(length))) * 1000), - segment_id_start = lubridate::as_datetime(segment_start_ts/1000, tz = .x), # these start and end datetime objects are for labeling only - segment_id_end = lubridate::as_datetime(segment_end_ts/1000, tz = .x), - segment_end_ts = segment_end_ts + 999, - segment_id = paste0("[", - paste0( - label,"#", - paste0(lubridate::date(segment_id_start), " ", - paste(str_pad(hour(segment_id_start),2, pad="0"), str_pad(minute(segment_id_start),2, pad="0"), str_pad(second(segment_id_start),2, pad="0"),sep =":"), ",", - lubridate::date(segment_id_end), " ", - paste(str_pad(hour(segment_id_end),2, pad="0"), str_pad(minute(segment_id_end),2, pad="0"), str_pad(second(segment_id_end),2, pad="0"),sep =":")),";", - paste0(segment_start_ts, ",", segment_end_ts) - ), - "]")) %>% - select(-segment_id_start, -segment_id_end)), - data = map2(data, inferred_day_segments, ~ .x %>% mutate(assigned_segments = map_chr(timestamp, ~find_segments_event(.x, inferred_day_segments))))) %>% + mutate(inferred_day_segments = map(local_timezone, ~ day_segments %>% + mutate(shift = ifelse(shift == "0", "0seconds", shift), + segment_start_ts = event_timestamp + (as.integer(seconds(lubridate::duration(shift))) * ifelse(shift_direction >= 0, 1, -1) * 1000), + segment_end_ts = segment_start_ts + (as.integer(seconds(lubridate::duration(length))) * 1000), + # these start and end datetime objects are for labeling only + segment_id_start = lubridate::as_datetime(segment_start_ts/1000, tz = .x), + segment_id_end = lubridate::as_datetime(segment_end_ts/1000, tz = .x), + segment_end_ts = segment_end_ts + 999, + segment_id = paste0("[", + paste0(label,"#", + paste0(lubridate::date(segment_id_start), " ", + paste(str_pad(hour(segment_id_start),2, pad="0"), str_pad(minute(segment_id_start),2, pad="0"), str_pad(second(segment_id_start),2, pad="0"),sep =":"), ",", + lubridate::date(segment_id_end), " ", + paste(str_pad(hour(segment_id_end),2, pad="0"), str_pad(minute(segment_id_end),2, pad="0"), str_pad(second(segment_id_end),2, pad="0"),sep =":")),";", + paste0(segment_start_ts, ",", segment_end_ts)), + "]"))), + data = map2(data, inferred_day_segments, assign_rows_to_segments)) %>% select(-inferred_day_segments) %>% unnest(data) %>% arrange(timestamp) - } - + return(sensor_data) } \ No newline at end of file diff --git a/src/data/download_dataset.R b/src/data/download_dataset.R index 7eadbf41..4876c5cf 100644 --- a/src/data/download_dataset.R +++ b/src/data/download_dataset.R @@ -40,9 +40,9 @@ is_multiplaform_participant <- function(dbEngine, device_ids, platforms){ participant <- snakemake@input[[1]] group <- snakemake@params[["group"]] table <- snakemake@params[["table"]] +sensor <- snakemake@params[["sensor"]] timezone <- snakemake@params[["timezone"]] aware_multiplatform_tables <- str_split(snakemake@params[["aware_multiplatform_tables"]], ",")[[1]] -unifiable_tables = snakemake@params[["unifiable_sensors"]] sensor_file <- snakemake@output[[1]] device_ids <- strsplit(readLines(participant, n=1), ",")[[1]] @@ -58,30 +58,26 @@ end_datetime_utc = format(as.POSIXct(paste0(end_date, " 23:59:59"),format="%Y/%m dbEngine <- dbConnect(MySQL(), default.file = "./.env", group = group) -# Get existent columns in table -available_columns <- colnames(dbGetQuery(dbEngine, paste0("SELECT * FROM ", table, " LIMIT 1"))) - -if("device_id" %in% available_columns){ - if(is_multiplaform_participant(dbEngine, device_ids, platforms)){ - sensor_data <- unify_raw_data(dbEngine, table, start_datetime_utc, end_datetime_utc, aware_multiplatform_tables, unifiable_tables, device_ids, platforms) - }else { - query <- paste0("SELECT * FROM ", table, " WHERE device_id IN ('", paste0(device_ids, collapse = "','"), "')") - if("timestamp" %in% available_columns && !(is.na(start_datetime_utc)) && !(is.na(end_datetime_utc)) && start_datetime_utc < end_datetime_utc) - query <- paste0(query, "AND timestamp BETWEEN 1000*UNIX_TIMESTAMP('", start_datetime_utc, "') AND 1000*UNIX_TIMESTAMP('", end_datetime_utc, "')") - sensor_data <- dbGetQuery(dbEngine, query) +if(is_multiplaform_participant(dbEngine, device_ids, platforms)){ + sensor_data <- unify_raw_data(dbEngine, table, sensor, start_datetime_utc, end_datetime_utc, aware_multiplatform_tables, device_ids, platforms) +}else { + # table has two elements for conversation and activity recognition (they store data on a different table for ios and android) + if(length(table) > 1){ + table <- table[[toupper(platforms[1])]] } - - if("timestamp" %in% available_columns) - sensor_data <- sensor_data %>% arrange(timestamp) - - # Unify device_id - sensor_data <- sensor_data %>% mutate(device_id = unified_device_id) - - # Droping duplicates on all columns except for _id or id - sensor_data <- sensor_data %>% distinct(!!!syms(setdiff(names(sensor_data), c("_id", "id")))) - -} else - stop(paste0("Table ", table, "does not have a device_id column (Aware ID) to link its data to a participant")) + query <- paste0("SELECT * FROM ", table, " WHERE device_id IN ('", paste0(device_ids, collapse = "','"), "')") + if(!(is.na(start_datetime_utc)) && !(is.na(end_datetime_utc)) && start_datetime_utc < end_datetime_utc) + query <- paste0(query, "AND timestamp BETWEEN 1000*UNIX_TIMESTAMP('", start_datetime_utc, "') AND 1000*UNIX_TIMESTAMP('", end_datetime_utc, "')") + sensor_data <- dbGetQuery(dbEngine, query) +} + +sensor_data <- sensor_data %>% arrange(timestamp) + +# Unify device_id +sensor_data <- sensor_data %>% mutate(device_id = unified_device_id) + +# Droping duplicates on all columns except for _id or id +sensor_data <- sensor_data %>% distinct(!!!syms(setdiff(names(sensor_data), c("_id", "id")))) write_csv(sensor_data, sensor_file) dbDisconnect(dbEngine) \ No newline at end of file diff --git a/src/data/unify_ios_android.R b/src/data/unify_ios_android.R index ca4211c5..bc93bada 100644 --- a/src/data/unify_ios_android.R +++ b/src/data/unify_ios_android.R @@ -4,11 +4,10 @@ source("src/data/unify_utils.R") sensor_data <- read.csv(snakemake@input[["sensor_data"]], stringsAsFactors = FALSE) participant_info <- snakemake@input[["participant_info"]] sensor <- snakemake@params[["sensor"]] -unifiable_sensors = snakemake@params[["unifiable_sensors"]] platforms <- strsplit(readLines(participant_info, n=2)[[2]], ",")[[1]] platform <- ifelse(platforms[1] == "multiple" | (length(platforms) > 1 & "android" %in% platforms & "ios" %in% platforms), "android", platforms[1]) -sensor_data <- unify_data(sensor_data, sensor, platform, unifiable_sensors) +sensor_data <- unify_data(sensor_data, sensor, platform) write.csv(sensor_data, snakemake@output[[1]], row.names = FALSE) diff --git a/src/data/unify_utils.R b/src/data/unify_utils.R index 0f23f311..1437b62f 100644 --- a/src/data/unify_utils.R +++ b/src/data/unify_utils.R @@ -101,7 +101,7 @@ clean_ios_activity_column <- function(ios_gar){ return(ios_gar) } -unify_ios_gar <- function(ios_gar){ +unify_ios_activity_recognition <- function(ios_gar){ # We only need to unify Google Activity Recognition data for iOS # discard rows where activities column is blank ios_gar <- ios_gar[-which(ios_gar$activities == ""), ] @@ -138,7 +138,7 @@ unify_ios_conversation <- function(conversation){ } # This function is used in download_dataset.R -unify_raw_data <- function(dbEngine, table, start_datetime_utc, end_datetime_utc, aware_multiplatform_tables, unifiable_tables, device_ids, platforms){ +unify_raw_data <- function(dbEngine, sensor_table, sensor, start_datetime_utc, end_datetime_utc, aware_multiplatform_tables, device_ids, platforms){ # If platforms is 'multiple', fetch each device_id's platform from aware_device, otherwise, use those given by the user if(length(platforms) == 1 && platforms == "multiple") devices_platforms <- dbGetQuery(dbEngine, paste0("SELECT device_id,brand FROM aware_device WHERE device_id IN ('", paste0(device_ids, collapse = "','"), "')")) %>% @@ -147,8 +147,9 @@ unify_raw_data <- function(dbEngine, table, start_datetime_utc, end_datetime_utc devices_platforms <- data.frame(device_id = device_ids, platform = platforms) # Get existent tables in database - available_tables_in_db <- dbGetQuery(dbEngine, paste0("SELECT table_name FROM information_schema.tables WHERE table_type = 'base table' AND table_schema='", dbGetInfo(dbEngine)$dbname,"'")) %>% pull(table_name) - + available_tables_in_db <- dbGetQuery(dbEngine, paste0("SELECT table_name FROM information_schema.tables WHERE table_schema='", dbGetInfo(dbEngine)$dbname,"'"))[[1]] + if(!any(sensor_table %in% available_tables_in_db)) + stop(paste0("You requested data from these table(s) ", paste0(sensor_table, collapse=", "), " but they don't exist in your database ", dbGetInfo(dbEngine)$dbname)) # Parse the table names for activity recognition and conversation plugins because they are different between android and ios ar_tables <- setNames(aware_multiplatform_tables[1:2], c("android", "ios")) conversation_tables <- setNames(aware_multiplatform_tables[3:4], c("android", "ios")) @@ -160,17 +161,19 @@ unify_raw_data <- function(dbEngine, table, start_datetime_utc, end_datetime_utc platform <- row$platform # Handle special cases when tables for the same sensor have different names for Android and iOS (AR and conversation) - if(table %in% ar_tables) + if(length(sensor_table) == 1) + table <- sensor_table + else if(all(sensor_table == ar_tables)) table <- ar_tables[[platform]] - else if(table %in% conversation_tables) + else if(all(sensor_table == conversation_tables)) table <- conversation_tables[[platform]] if(table %in% available_tables_in_db){ query <- paste0("SELECT * FROM ", table, " WHERE device_id IN ('", device_id, "')") - if("timestamp" %in% available_columns && !(is.na(start_datetime_utc)) && !(is.na(end_datetime_utc)) && start_datetime_utc < end_datetime_utc){ + if(!(is.na(start_datetime_utc)) && !(is.na(end_datetime_utc)) && start_datetime_utc < end_datetime_utc){ query <- paste0(query, "AND timestamp BETWEEN 1000*UNIX_TIMESTAMP('", start_datetime_utc, "') AND 1000*UNIX_TIMESTAMP('", end_datetime_utc, "')") } - sensor_data <- unify_data(dbGetQuery(dbEngine, query), table, platform, unifiable_tables) + sensor_data <- unify_data(dbGetQuery(dbEngine, query), sensor, platform) participants_sensordata <- append(participants_sensordata, list(sensor_data)) }else{ warning(paste0("Missing ", table, " table. We unified the data from ", paste0(devices_platforms$device_id, collapse = " and "), " but without records from this missing table for ", device_id)) @@ -182,25 +185,16 @@ unify_raw_data <- function(dbEngine, table, start_datetime_utc, end_datetime_utc } # This function is used in unify_ios_android.R and unify_raw_data function -unify_data <- function(sensor_data, sensor, platform, unifiable_sensors){ - if(sensor == unifiable_sensors$calls){ - if(platform == "ios"){ - sensor_data = unify_ios_calls(sensor_data) - } - # android calls remain unchanged - } else if(sensor == unifiable_sensors$battery){ - if(platform == "ios"){ - sensor_data = unify_ios_battery(sensor_data) - } - # android battery remains unchanged - } else if(sensor == unifiable_sensors$ios_activity_recognition){ - sensor_data = unify_ios_gar(sensor_data) - } else if(sensor == unifiable_sensors$screen){ - if(platform == "ios"){ - sensor_data = unify_ios_screen(sensor_data) - } - # android screen remains unchanged - } else if(sensor == unifiable_sensors$ios_conversation){ +unify_data <- function(sensor_data, sensor, platform){ + if(sensor == "phone_calls" & platform == "ios"){ + sensor_data = unify_ios_calls(sensor_data) + } else if(sensor == "phone_battery" & platform == "ios"){ + sensor_data = unify_ios_battery(sensor_data) + } else if(sensor == "phone_activity_recognition" & platform == "ios"){ + sensor_data = unify_ios_activity_recognition(sensor_data) + } else if(sensor == "phone_screen" & platform == "ios"){ + sensor_data = unify_ios_screen(sensor_data) + } else if(sensor == "phone_conversation" & platform == "ios"){ sensor_data = unify_ios_conversation(sensor_data) } return(sensor_data) diff --git a/src/features/accelerometer/panda/main.py b/src/features/phone_accelerometer/panda/main.py similarity index 100% rename from src/features/accelerometer/panda/main.py rename to src/features/phone_accelerometer/panda/main.py diff --git a/src/features/accelerometer/rapids/main.py b/src/features/phone_accelerometer/rapids/main.py similarity index 100% rename from src/features/accelerometer/rapids/main.py rename to src/features/phone_accelerometer/rapids/main.py diff --git a/src/features/activity_recognition/episodes/activity_recognition_episodes.R b/src/features/phone_activity_recognition/episodes/activity_recognition_episodes.R similarity index 100% rename from src/features/activity_recognition/episodes/activity_recognition_episodes.R rename to src/features/phone_activity_recognition/episodes/activity_recognition_episodes.R diff --git a/src/features/activity_recognition/rapids/main.py b/src/features/phone_activity_recognition/rapids/main.py similarity index 100% rename from src/features/activity_recognition/rapids/main.py rename to src/features/phone_activity_recognition/rapids/main.py diff --git a/src/features/applications_foreground/rapids/main.py b/src/features/phone_applications_foreground/rapids/main.py similarity index 100% rename from src/features/applications_foreground/rapids/main.py rename to src/features/phone_applications_foreground/rapids/main.py diff --git a/src/features/battery/episodes/battery_episodes.R b/src/features/phone_battery/episodes/battery_episodes.R similarity index 100% rename from src/features/battery/episodes/battery_episodes.R rename to src/features/phone_battery/episodes/battery_episodes.R diff --git a/src/features/battery/rapids/main.py b/src/features/phone_battery/rapids/main.py similarity index 100% rename from src/features/battery/rapids/main.py rename to src/features/phone_battery/rapids/main.py diff --git a/src/features/bluetooth/rapids/main.R b/src/features/phone_bluetooth/rapids/main.R similarity index 100% rename from src/features/bluetooth/rapids/main.R rename to src/features/phone_bluetooth/rapids/main.R diff --git a/src/features/calls/rapids/main.R b/src/features/phone_calls/rapids/main.R similarity index 100% rename from src/features/calls/rapids/main.R rename to src/features/phone_calls/rapids/main.R diff --git a/src/features/conversation/rapids/main.py b/src/features/phone_conversation/rapids/main.py similarity index 100% rename from src/features/conversation/rapids/main.py rename to src/features/phone_conversation/rapids/main.py diff --git a/src/features/light/rapids/main.py b/src/features/phone_light/rapids/main.py similarity index 100% rename from src/features/light/rapids/main.py rename to src/features/phone_light/rapids/main.py diff --git a/src/features/locations/barnett/library/AvgFlightDur.R b/src/features/phone_locations/barnett/library/AvgFlightDur.R similarity index 100% rename from src/features/locations/barnett/library/AvgFlightDur.R rename to src/features/phone_locations/barnett/library/AvgFlightDur.R diff --git a/src/features/locations/barnett/library/AvgFlightLen.R b/src/features/phone_locations/barnett/library/AvgFlightLen.R similarity index 100% rename from src/features/locations/barnett/library/AvgFlightLen.R rename to src/features/phone_locations/barnett/library/AvgFlightLen.R diff --git a/src/features/locations/barnett/library/Collapse2Pause.R b/src/features/phone_locations/barnett/library/Collapse2Pause.R similarity index 100% rename from src/features/locations/barnett/library/Collapse2Pause.R rename to src/features/phone_locations/barnett/library/Collapse2Pause.R diff --git a/src/features/locations/barnett/library/DailyMobilityPlots.R b/src/features/phone_locations/barnett/library/DailyMobilityPlots.R similarity index 100% rename from src/features/locations/barnett/library/DailyMobilityPlots.R rename to src/features/phone_locations/barnett/library/DailyMobilityPlots.R diff --git a/src/features/locations/barnett/library/DailyRoutineIndex.R b/src/features/phone_locations/barnett/library/DailyRoutineIndex.R similarity index 100% rename from src/features/locations/barnett/library/DailyRoutineIndex.R rename to src/features/phone_locations/barnett/library/DailyRoutineIndex.R diff --git a/src/features/locations/barnett/library/DayDist.R b/src/features/phone_locations/barnett/library/DayDist.R similarity index 100% rename from src/features/locations/barnett/library/DayDist.R rename to src/features/phone_locations/barnett/library/DayDist.R diff --git a/src/features/locations/barnett/library/DistanceTravelled.R b/src/features/phone_locations/barnett/library/DistanceTravelled.R similarity index 100% rename from src/features/locations/barnett/library/DistanceTravelled.R rename to src/features/phone_locations/barnett/library/DistanceTravelled.R diff --git a/src/features/locations/barnett/library/ExtractFlights.R b/src/features/phone_locations/barnett/library/ExtractFlights.R similarity index 100% rename from src/features/locations/barnett/library/ExtractFlights.R rename to src/features/phone_locations/barnett/library/ExtractFlights.R diff --git a/src/features/locations/barnett/library/ExtractTimePeriod.R b/src/features/phone_locations/barnett/library/ExtractTimePeriod.R similarity index 100% rename from src/features/locations/barnett/library/ExtractTimePeriod.R rename to src/features/phone_locations/barnett/library/ExtractTimePeriod.R diff --git a/src/features/locations/barnett/library/GPS2MobMat.R b/src/features/phone_locations/barnett/library/GPS2MobMat.R similarity index 100% rename from src/features/locations/barnett/library/GPS2MobMat.R rename to src/features/phone_locations/barnett/library/GPS2MobMat.R diff --git a/src/features/locations/barnett/library/GPSmobility-internal.R b/src/features/phone_locations/barnett/library/GPSmobility-internal.R similarity index 100% rename from src/features/locations/barnett/library/GPSmobility-internal.R rename to src/features/phone_locations/barnett/library/GPSmobility-internal.R diff --git a/src/features/locations/barnett/library/GetMobilityFeaturesMat.R b/src/features/phone_locations/barnett/library/GetMobilityFeaturesMat.R similarity index 100% rename from src/features/locations/barnett/library/GetMobilityFeaturesMat.R rename to src/features/phone_locations/barnett/library/GetMobilityFeaturesMat.R diff --git a/src/features/locations/barnett/library/GuessPause.R b/src/features/phone_locations/barnett/library/GuessPause.R similarity index 100% rename from src/features/locations/barnett/library/GuessPause.R rename to src/features/phone_locations/barnett/library/GuessPause.R diff --git a/src/features/locations/barnett/library/Hometime.R b/src/features/phone_locations/barnett/library/Hometime.R similarity index 100% rename from src/features/locations/barnett/library/Hometime.R rename to src/features/phone_locations/barnett/library/Hometime.R diff --git a/src/features/locations/barnett/library/InitializeParams.R b/src/features/phone_locations/barnett/library/InitializeParams.R similarity index 100% rename from src/features/locations/barnett/library/InitializeParams.R rename to src/features/phone_locations/barnett/library/InitializeParams.R diff --git a/src/features/locations/barnett/library/IsFlight.R b/src/features/phone_locations/barnett/library/IsFlight.R similarity index 100% rename from src/features/locations/barnett/library/IsFlight.R rename to src/features/phone_locations/barnett/library/IsFlight.R diff --git a/src/features/locations/barnett/library/LatLong2XY.R b/src/features/phone_locations/barnett/library/LatLong2XY.R similarity index 100% rename from src/features/locations/barnett/library/LatLong2XY.R rename to src/features/phone_locations/barnett/library/LatLong2XY.R diff --git a/src/features/locations/barnett/library/LocationAt.R b/src/features/phone_locations/barnett/library/LocationAt.R similarity index 100% rename from src/features/locations/barnett/library/LocationAt.R rename to src/features/phone_locations/barnett/library/LocationAt.R diff --git a/src/features/locations/barnett/library/MaxDiam.R b/src/features/phone_locations/barnett/library/MaxDiam.R similarity index 100% rename from src/features/locations/barnett/library/MaxDiam.R rename to src/features/phone_locations/barnett/library/MaxDiam.R diff --git a/src/features/locations/barnett/library/MaxDistBetweenTrajectories.R b/src/features/phone_locations/barnett/library/MaxDistBetweenTrajectories.R similarity index 100% rename from src/features/locations/barnett/library/MaxDistBetweenTrajectories.R rename to src/features/phone_locations/barnett/library/MaxDistBetweenTrajectories.R diff --git a/src/features/locations/barnett/library/MaxHomeDist.R b/src/features/phone_locations/barnett/library/MaxHomeDist.R similarity index 100% rename from src/features/locations/barnett/library/MaxHomeDist.R rename to src/features/phone_locations/barnett/library/MaxHomeDist.R diff --git a/src/features/locations/barnett/library/MaxRadius.R b/src/features/phone_locations/barnett/library/MaxRadius.R similarity index 100% rename from src/features/locations/barnett/library/MaxRadius.R rename to src/features/phone_locations/barnett/library/MaxRadius.R diff --git a/src/features/locations/barnett/library/MinsMissing.R b/src/features/phone_locations/barnett/library/MinsMissing.R similarity index 100% rename from src/features/locations/barnett/library/MinsMissing.R rename to src/features/phone_locations/barnett/library/MinsMissing.R diff --git a/src/features/locations/barnett/library/MobilityFeatures.R b/src/features/phone_locations/barnett/library/MobilityFeatures.R similarity index 100% rename from src/features/locations/barnett/library/MobilityFeatures.R rename to src/features/phone_locations/barnett/library/MobilityFeatures.R diff --git a/src/features/locations/barnett/library/MobmatQualityOK.R b/src/features/phone_locations/barnett/library/MobmatQualityOK.R similarity index 100% rename from src/features/locations/barnett/library/MobmatQualityOK.R rename to src/features/phone_locations/barnett/library/MobmatQualityOK.R diff --git a/src/features/locations/barnett/library/ProbPause.R b/src/features/phone_locations/barnett/library/ProbPause.R similarity index 100% rename from src/features/locations/barnett/library/ProbPause.R rename to src/features/phone_locations/barnett/library/ProbPause.R diff --git a/src/features/locations/barnett/library/ProgressBar.R b/src/features/phone_locations/barnett/library/ProgressBar.R similarity index 100% rename from src/features/locations/barnett/library/ProgressBar.R rename to src/features/phone_locations/barnett/library/ProgressBar.R diff --git a/src/features/locations/barnett/library/RadiusOfGyration.R b/src/features/phone_locations/barnett/library/RadiusOfGyration.R similarity index 100% rename from src/features/locations/barnett/library/RadiusOfGyration.R rename to src/features/phone_locations/barnett/library/RadiusOfGyration.R diff --git a/src/features/locations/barnett/library/RandomBridge.R b/src/features/phone_locations/barnett/library/RandomBridge.R similarity index 100% rename from src/features/locations/barnett/library/RandomBridge.R rename to src/features/phone_locations/barnett/library/RandomBridge.R diff --git a/src/features/locations/barnett/library/SigLocEntropy.R b/src/features/phone_locations/barnett/library/SigLocEntropy.R similarity index 100% rename from src/features/locations/barnett/library/SigLocEntropy.R rename to src/features/phone_locations/barnett/library/SigLocEntropy.R diff --git a/src/features/locations/barnett/library/SigLocs.R b/src/features/phone_locations/barnett/library/SigLocs.R similarity index 100% rename from src/features/locations/barnett/library/SigLocs.R rename to src/features/phone_locations/barnett/library/SigLocs.R diff --git a/src/features/locations/barnett/library/SigLocsVisited.R b/src/features/phone_locations/barnett/library/SigLocsVisited.R similarity index 100% rename from src/features/locations/barnett/library/SigLocsVisited.R rename to src/features/phone_locations/barnett/library/SigLocsVisited.R diff --git a/src/features/locations/barnett/library/SimulateMobilityGaps.R b/src/features/phone_locations/barnett/library/SimulateMobilityGaps.R similarity index 100% rename from src/features/locations/barnett/library/SimulateMobilityGaps.R rename to src/features/phone_locations/barnett/library/SimulateMobilityGaps.R diff --git a/src/features/locations/barnett/library/StdFlightDur.R b/src/features/phone_locations/barnett/library/StdFlightDur.R similarity index 100% rename from src/features/locations/barnett/library/StdFlightDur.R rename to src/features/phone_locations/barnett/library/StdFlightDur.R diff --git a/src/features/locations/barnett/library/StdFlightLen.R b/src/features/phone_locations/barnett/library/StdFlightLen.R similarity index 100% rename from src/features/locations/barnett/library/StdFlightLen.R rename to src/features/phone_locations/barnett/library/StdFlightLen.R diff --git a/src/features/locations/barnett/library/WriteSurveyAnswers2File.R b/src/features/phone_locations/barnett/library/WriteSurveyAnswers2File.R similarity index 100% rename from src/features/locations/barnett/library/WriteSurveyAnswers2File.R rename to src/features/phone_locations/barnett/library/WriteSurveyAnswers2File.R diff --git a/src/features/locations/barnett/library/plot.flights.R b/src/features/phone_locations/barnett/library/plot.flights.R similarity index 100% rename from src/features/locations/barnett/library/plot.flights.R rename to src/features/phone_locations/barnett/library/plot.flights.R diff --git a/src/features/locations/barnett/library/plotlimits.R b/src/features/phone_locations/barnett/library/plotlimits.R similarity index 100% rename from src/features/locations/barnett/library/plotlimits.R rename to src/features/phone_locations/barnett/library/plotlimits.R diff --git a/src/features/locations/barnett/main.R b/src/features/phone_locations/barnett/main.R similarity index 89% rename from src/features/locations/barnett/main.R rename to src/features/phone_locations/barnett/main.R index 08a61890..da013924 100644 --- a/src/features/locations/barnett/main.R +++ b/src/features/phone_locations/barnett/main.R @@ -3,7 +3,7 @@ library("dplyr") library("stringr") # Load Ian Barnett's code. Taken from https://scholar.harvard.edu/ibarnett/software/gpsmobility -file.sources = list.files(c("src/features/locations/barnett/library"), pattern="*.R$", full.names=TRUE, ignore.case=TRUE) +file.sources = list.files(c("src/features/phone_locations/barnett/library"), pattern="*.R$", full.names=TRUE, ignore.case=TRUE) sapply(file.sources,source,.GlobalEnv) create_empty_file <- function(requested_features){ @@ -52,10 +52,13 @@ barnett_features <- function(sensor_data_files, day_segment, params){ if (nrow(location) > 1){ # Filter by segment and skipping any non-daily segment location <- location %>% filter_data_by_segment(day_segment) - segment <- location %>% head(1) %>% pull(local_segment) - segment_data <- str_split(segment, "#")[[1]] - if(segment_data[[2]] != segment_data[[4]] || segment_data[[3]] != "00:00:00" || segment_data[[5]] != "23:59:59"){ - warning(paste("Barnett's location features cannot be computed for day segmentes that are not daily (cover 00:00:00 to 23:59:59 of every day). Skipping for ", segment)) + + datetime_start_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 00:00:00" + datetime_end_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 23:59:59" + location <- location %>% mutate(is_daily = str_detect(local_segment, paste0(day_segment, "#", datetime_start_regex, ",", datetime_end_regex))) + + if(!all(location$is_daily)){ + message(paste("Barnett's location features cannot be computed for day segmentes that are not daily (cover 00:00:00 to 23:59:59 of every day). Skipping ", day_segment)) location_features <- create_empty_file(requested_features) } else { # Count how many minutes of data we use to get location features diff --git a/src/features/locations/doryab/main.py b/src/features/phone_locations/doryab/main.py similarity index 100% rename from src/features/locations/doryab/main.py rename to src/features/phone_locations/doryab/main.py diff --git a/src/features/messages/rapids/main.R b/src/features/phone_messages/rapids/main.R similarity index 100% rename from src/features/messages/rapids/main.R rename to src/features/phone_messages/rapids/main.R diff --git a/src/features/screen/episodes/screen_episodes.R b/src/features/phone_screen/episodes/screen_episodes.R similarity index 100% rename from src/features/screen/episodes/screen_episodes.R rename to src/features/phone_screen/episodes/screen_episodes.R diff --git a/src/features/screen/rapids/main.py b/src/features/phone_screen/rapids/main.py similarity index 100% rename from src/features/screen/rapids/main.py rename to src/features/phone_screen/rapids/main.py diff --git a/src/features/wifi/rapids/main.R b/src/features/phone_wifi_connected/rapids/main.R similarity index 100% rename from src/features/wifi/rapids/main.R rename to src/features/phone_wifi_connected/rapids/main.R diff --git a/src/features/phone_wifi_visible/rapids/main.R b/src/features/phone_wifi_visible/rapids/main.R new file mode 100644 index 00000000..7c4ea072 --- /dev/null +++ b/src/features/phone_wifi_visible/rapids/main.R @@ -0,0 +1,46 @@ +library(dplyr) + +compute_wifi_feature <- function(data, feature, day_segment){ + data <- data %>% filter_data_by_segment(day_segment) + if(feature %in% c("countscans", "uniquedevices")){ + data <- data %>% group_by(local_segment) + data <- switch(feature, + "countscans" = data %>% summarise(!!paste("wifi_rapids", feature, sep = "_") := n()), + "uniquedevices" = data %>% summarise(!!paste("wifi_rapids", feature, sep = "_") := n_distinct(bssid))) + return(data) + } else if(feature == "countscansmostuniquedevice"){ + # Get the most scanned device + mostuniquedevice <- data %>% + group_by(bssid) %>% + mutate(N=n()) %>% + ungroup() %>% + filter(N == max(N)) %>% + head(1) %>% # if there are multiple device with the same amount of scans pick the first one only + pull(bssid) + return(data %>% + filter(bssid == mostuniquedevice) %>% + group_by(local_segment) %>% + summarise(!!paste("wifi_rapids", feature, sep = "_") := n()) %>% + replace(is.na(.), 0)) + } +} + +rapids_features <- function(sensor_data_files, day_segment, provider){ + wifi_data <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE) + requested_features <- provider[["FEATURES"]] + # Output dataframe + features = data.frame(local_segment = character(), stringsAsFactors = FALSE) + + # The name of the features this function can compute + base_features_names <- c("countscans", "uniquedevices", "countscansmostuniquedevice") + + # The subset of requested features this function can compute + features_to_compute <- intersect(base_features_names, requested_features) + + for(feature_name in features_to_compute){ + feature <- compute_wifi_feature(wifi_data, feature_name, day_segment) + features <- merge(features, feature, by="local_segment", all = TRUE) + } + + return(features) +} diff --git a/src/features/utils/resample_episodes.R b/src/features/utils/resample_episodes.R index b25f05ce..eac57f05 100644 --- a/src/features/utils/resample_episodes.R +++ b/src/features/utils/resample_episodes.R @@ -1,6 +1,8 @@ source("renv/activate.R") library("dplyr") library("tidyr") +library("tibble") +options(scipen=999) # Using mostly indeixng instead of tidyr because is faster resampled_episodes <- read.csv(snakemake@input[[1]]) diff --git a/src/features/utils/utils.py b/src/features/utils/utils.py index 86741291..d2060aee 100644 --- a/src/features/utils/utils.py +++ b/src/features/utils/utils.py @@ -74,7 +74,7 @@ def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_file sensor_features = pd.DataFrame(columns=["local_segment"]) day_segments_labels = pd.read_csv(day_segments_file, header=0) if "FEATURES" not in provider: - raise ValueError("Provider config[{}][PROVIDERS][{}] is missing a FEATURES attribute in config.yaml".format(sensor_key.upper(), provider_key)) + raise ValueError("Provider config[{}][PROVIDERS][{}] is missing a FEATURES attribute in config.yaml".format(sensor_key.upper(), provider_key.upper())) if provider["COMPUTE"] == True: