diff --git a/Snakefile b/Snakefile index 30f1c64d..0bd59766 100644 --- a/Snakefile +++ b/Snakefile @@ -36,9 +36,9 @@ for provider in config["PHONE_MESSAGES"]["PROVIDERS"].keys(): for provider in config["PHONE_CALLS"]["PROVIDERS"].keys(): if config["PHONE_CALLS"]["PROVIDERS"][provider]["COMPUTE"]: - files_to_compute.extend(expand("data/raw/{pid}/phone_calls_raw.csv", pid=config["PIDS"], sensor=config["PHONE_CALLS"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime.csv", pid=config["PIDS"], sensor=config["PHONE_CALLS"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime_unified.csv", pid=config["PIDS"], sensor=config["PHONE_CALLS"]["DB_TABLE"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_calls_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime_unified.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_calls_features/phone_calls_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_CALLS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/phone_calls.csv", pid=config["PIDS"])) @@ -122,23 +122,6 @@ for provider in config["PHONE_WIFI_CONNECTED"]["PROVIDERS"].keys(): files_to_compute.extend(expand("data/interim/{pid}/phone_wifi_connected_features/phone_wifi_connected_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_WIFI_CONNECTED"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/phone_wifi_connected.csv", pid=config["PIDS"])) -if config["HEARTRATE"]["COMPUTE"]: - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], 
sensor=config["HEARTRATE"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_{fitbit_data_type}_with_datetime.csv", pid=config["PIDS"], fitbit_data_type=["summary", "intraday"])) - files_to_compute.extend(expand("data/processed/{pid}/fitbit_heartrate_{day_segment}.csv", pid = config["PIDS"], day_segment = config["HEARTRATE"]["DAY_SEGMENTS"])) - -if config["STEP"]["COMPUTE"]: - if config["STEP"]["EXCLUDE_SLEEP"]["EXCLUDE"] == True and config["STEP"]["EXCLUDE_SLEEP"]["TYPE"] == "FITBIT_BASED": - files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_{fitbit_data_type}_with_datetime.csv", pid=config["PIDS"], fitbit_data_type=["summary"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["STEP"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/fitbit_step_{fitbit_data_type}_with_datetime.csv", pid=config["PIDS"], fitbit_data_type=["intraday"])) - files_to_compute.extend(expand("data/processed/{pid}/fitbit_step_{day_segment}.csv", pid = config["PIDS"], day_segment = config["STEP"]["DAY_SEGMENTS"])) - -if config["SLEEP"]["COMPUTE"]: - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["SLEEP"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_{fitbit_data_type}_with_datetime.csv", pid=config["PIDS"], fitbit_data_type=["intraday", "summary"])) - files_to_compute.extend(expand("data/processed/{pid}/fitbit_sleep_{day_segment}.csv", pid = config["PIDS"], day_segment = config["SLEEP"]["DAY_SEGMENTS"])) - for provider in config["PHONE_CONVERSATION"]["PROVIDERS"].keys(): if config["PHONE_CONVERSATION"]["PROVIDERS"][provider]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/phone_conversation_raw.csv", pid=config["PIDS"])) @@ -150,10 +133,10 @@ for provider in config["PHONE_CONVERSATION"]["PROVIDERS"].keys(): for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys(): if 
config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["COMPUTE"]: if config["PHONE_LOCATIONS"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED": - if config["PHONE_LOCATIONS"]["DB_TABLE"] in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"]: + if "PHONE_LOCATIONS" in config["PHONE_VALID_SENSED_BINS"]["PHONE_SENSORS"]: files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"])) else: - raise ValueError("Error: Add your locations table (and as many sensor tables as you have) to [PHONE_VALID_SENSED_BINS][DB_TABLES] in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data) which is used to resample fused location data (RESAMPLED_FUSED)") + raise ValueError("Error: Add PHONE_LOCATIONS (and as many PHONE_SENSORS as you have) to [PHONE_VALID_SENSED_BINS][PHONE_SENSORS] in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data) which is used to resample fused location data (RESAMPLED_FUSED)") files_to_compute.extend(expand("data/raw/{pid}/phone_locations_raw.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed.csv", pid=config["PIDS"])) @@ -161,6 +144,30 @@ for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys(): files_to_compute.extend(expand("data/interim/{pid}/phone_locations_features/phone_locations_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/phone_locations.csv", pid=config["PIDS"])) + +for provider in config["FITBIT_HEARTRATE"]["PROVIDERS"].keys(): + if config["FITBIT_HEARTRATE"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_raw.csv", pid=config["PIDS"])) + # 
files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_{fitbit_data_type}_with_datetime.csv", pid=config["PIDS"], fitbit_data_type=["summary", "intraday"])) + # files_to_compute.extend(expand("data/processed/{pid}/fitbit_heartrate_{day_segment}.csv", pid = config["PIDS"], day_segment = config["HEARTRATE"]["DAY_SEGMENTS"])) + +for provider in config["FITBIT_STEPS"]["PROVIDERS"].keys(): + if config["FITBIT_STEPS"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_raw.csv", pid=config["PIDS"])) +# if config["STEP"]["COMPUTE"]: +# if config["STEP"]["EXCLUDE_SLEEP"]["EXCLUDE"] == True and config["STEP"]["EXCLUDE_SLEEP"]["TYPE"] == "FITBIT_BASED": +# files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_{fitbit_data_type}_with_datetime.csv", pid=config["PIDS"], fitbit_data_type=["summary"])) +# files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["STEP"]["TABLE"])) +# files_to_compute.extend(expand("data/raw/{pid}/fitbit_step_{fitbit_data_type}_with_datetime.csv", pid=config["PIDS"], fitbit_data_type=["intraday"])) +# files_to_compute.extend(expand("data/processed/{pid}/fitbit_step_{day_segment}.csv", pid = config["PIDS"], day_segment = config["STEP"]["DAY_SEGMENTS"])) + +for provider in config["FITBIT_SLEEP"]["PROVIDERS"].keys(): + if config["FITBIT_SLEEP"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_raw.csv", pid=config["PIDS"])) +# files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_{fitbit_data_type}_with_datetime.csv", pid=config["PIDS"], fitbit_data_type=["intraday", "summary"])) +# files_to_compute.extend(expand("data/processed/{pid}/fitbit_sleep_{day_segment}.csv", pid = config["PIDS"], day_segment = config["SLEEP"]["DAY_SEGMENTS"])) + + # visualization for data exploration if config["HEATMAP_FEATURES_CORRELATIONS"]["PLOT"]: 
files_to_compute.extend(expand("reports/data_exploration/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/heatmap_features_correlations.html", min_valid_hours_per_day=config["HEATMAP_FEATURES_CORRELATIONS"]["MIN_VALID_HOURS_PER_DAY"], min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"])) diff --git a/config.yaml b/config.yaml index 4183c644..55b69a9d 100644 --- a/config.yaml +++ b/config.yaml @@ -8,33 +8,46 @@ DAY_SEGMENTS: &day_segments FILE: "data/external/daysegments_periodic.csv" INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC, if set to TRUE we consider day segments back enough in the past as to include the first day of data -# Global timezone -# Use codes from https://en.wikipedia.org/wiki/List_of_tz_database_time_zones -# Double check your code, for example EST is not US Eastern Time. +# Use tz codes from https://en.wikipedia.org/wiki/List_of_tz_database_time_zones. Double check your code, for example EST is not US Eastern Time. TIMEZONE: &timezone America/New_York DATABASE_GROUP: &database_group MY_GROUP -DOWNLOAD_PARTICIPANTS: - IGNORED_DEVICE_IDS: [] # for example "5a1dd68c-6cd1-48fe-ae1e-14344ac5215f" - GROUP: *database_group +PARTICIPANT_FILES: # run snakemake -j1 -R parse_participant_files + PHONE_SECTION: + INCLUDE: TRUE + PARSED_FROM: AWARE_DEVICE_TABLE #AWARE_DEVICE_TABLE or CSV_FILE + PARSED_SOURCE: *database_group # DB credentials group or CSV file path. 
If CSV file, it should have: device_id, pid (optional), label (optional), start_date (optional), end_date (optional) + IGNORED_DEVICE_IDS: [] + FITBIT_SECTION: + INCLUDE: FALSE + SAME_AS_PHONE: FALSE # If TRUE, all config below is ignored + PARSED_FROM: CSV_FILE + PARSED_SOURCE: "external/my_fitbit_participants.csv" # CSV file should have: device_id, pid (optional), label (optional), start_date (optional), end_date (optional) -# Download data config -DOWNLOAD_DATASET: - GROUP: *database_group - -# Readable datetime config -READABLE_DATETIME: - FIXED_TIMEZONE: *timezone +SENSOR_DATA: + PHONE: + SOURCE: + TYPE: DATABASE + DATABASE_GROUP: *database_group + DEVICE_ID_COLUMN: device_id # column name + TIMEZONE: + TYPE: SINGLE # SINGLE or MULTIPLE + VALUE: *timezone # IF TYPE=SINGLE, timezone code (e.g. America/New_York, see attribute TIMEZONE above). If TYPE=MULTIPLE, a table in your database with two columns (timestamp, timezone) where timestamp is a unix timestamp and timezone is one of https://en.wikipedia.org/wiki/List_of_tz_database_time_zones + FITBIT: + SOURCE: + TYPE: DATABASE # DATABASE or CSV_FILES (set each FITBIT_SENSOR TABLE attribute accordingly) + DATABASE_GROUP: *database_group + DEVICE_ID_COLUMN: device_id # column name PHONE_VALID_SENSED_BINS: COMPUTE: False # This flag is automatically ignored (set to True) if you are extracting PHONE_VALID_SENSED_DAYS or screen or Barnett's location features BIN_SIZE: &bin_size 5 # (in minutes) # Add as many PHONE sensors as you have, they all improve the computation of PHONE_VALID_SENSED_BINS and PHONE_VALID_SENSED_DAYS. # If you are extracting screen or Barnett/Doryab location features, PHONE_SCREEN and PHONE_LOCATIONS tables are mandatory. - # You can choose any of the keys shown below, just make sure its DB_TABLE exists in your database! + # You can choose any of the keys shown below, just make sure its TABLE exists in your database! 
# PHONE_MESSAGES, PHONE_CALLS, PHONE_LOCATIONS, PHONE_BLUETOOTH, PHONE_ACTIVITY_RECOGNITION, PHONE_BATTERY, PHONE_SCREEN, PHONE_LIGHT, # PHONE_ACCELEROMETER, PHONE_APPLICATIONS_FOREGROUND, PHONE_WIFI_VISIBLE, PHONE_WIFI_CONNECTED, PHONE_CONVERSATION PHONE_SENSORS: [] @@ -46,7 +59,7 @@ PHONE_VALID_SENSED_DAYS: # Communication SMS features config, TYPES and FEATURES keys need to match PHONE_MESSAGES: - DB_TABLE: messages + TABLE: messages PROVIDERS: RAPIDS: COMPUTE: False @@ -59,10 +72,10 @@ PHONE_MESSAGES: # Communication call features config, TYPES and FEATURES keys need to match PHONE_CALLS: - DB_TABLE: calls + TABLE: calls PROVIDERS: RAPIDS: - COMPUTE: False + COMPUTE: True CALL_TYPES: [missed, incoming, outgoing] FEATURES: missed: [count, distinctcontacts, timefirstcall, timelastcall, countmostfrequentcontact] @@ -72,7 +85,7 @@ PHONE_CALLS: SRC_FOLDER: "rapids" # inside src/features/phone_calls PHONE_LOCATIONS: - DB_TABLE: locations + TABLE: locations LOCATIONS_TO_USE: FUSED_RESAMPLED # ALL, GPS OR FUSED_RESAMPLED FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row @@ -99,7 +112,7 @@ PHONE_LOCATIONS: SRC_LANGUAGE: "r" PHONE_BLUETOOTH: - DB_TABLE: bluetooth + TABLE: bluetooth PROVIDERS: RAPIDS: COMPUTE: False @@ -109,12 +122,12 @@ PHONE_BLUETOOTH: PHONE_ACTIVITY_RECOGNITION: - DB_TABLE: + TABLE: ANDROID: plugin_google_activity_recognition IOS: plugin_ios_activity_recognition PROVIDERS: RAPIDS: - COMPUTE: False + COMPUTE: True FEATURES: ["count", "mostcommonactivity", "countuniqueactivities", "durationstationary", "durationmobile", "durationvehicle"] ACTIVITY_CLASSES: STATIONARY: ["still", "tilting"] @@ -124,7 +137,7 @@ PHONE_ACTIVITY_RECOGNITION: 
SRC_LANGUAGE: "python" PHONE_BATTERY: - DB_TABLE: battery + TABLE: battery PROVIDERS: RAPIDS: COMPUTE: False @@ -133,7 +146,7 @@ PHONE_BATTERY: SRC_LANGUAGE: "python" PHONE_SCREEN: - DB_TABLE: screen + TABLE: screen PROVIDERS: RAPIDS: COMPUTE: False @@ -146,7 +159,7 @@ PHONE_SCREEN: SRC_LANGUAGE: "python" PHONE_LIGHT: - DB_TABLE: light + TABLE: light PROVIDERS: RAPIDS: COMPUTE: False @@ -155,7 +168,7 @@ PHONE_LIGHT: SRC_LANGUAGE: "python" PHONE_ACCELEROMETER: - DB_TABLE: accelerometer + TABLE: accelerometer PROVIDERS: RAPIDS: COMPUTE: False @@ -173,7 +186,7 @@ PHONE_ACCELEROMETER: SRC_LANGUAGE: "python" PHONE_APPLICATIONS_FOREGROUND: - DB_TABLE: applications_foreground + TABLE: applications_foreground APPLICATION_CATEGORIES: CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scrapped from the Play Store) CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv" @@ -194,7 +207,7 @@ PHONE_APPLICATIONS_FOREGROUND: SRC_LANGUAGE: "python" PHONE_WIFI_VISIBLE: - DB_TABLE: "wifi" + TABLE: "wifi" PROVIDERS: RAPIDS: COMPUTE: False @@ -203,7 +216,7 @@ PHONE_WIFI_VISIBLE: SRC_LANGUAGE: "r" PHONE_WIFI_CONNECTED: - DB_TABLE: "sensor_wifi" + TABLE: "sensor_wifi" PROVIDERS: RAPIDS: COMPUTE: False @@ -212,12 +225,12 @@ PHONE_WIFI_CONNECTED: SRC_LANGUAGE: "r" PHONE_CONVERSATION: - DB_TABLE: + TABLE: ANDROID: plugin_studentlife_audio_android IOS: plugin_studentlife_audio PROVIDERS: RAPIDS: - COMPUTE: False + COMPUTE: True FEATURES: ["minutessilence", "minutesnoise", "minutesvoice", "minutesunknown","sumconversationduration","avgconversationduration", "sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","sumenergy", "avgenergy","sdenergy","minenergy","maxenergy","silencesensedfraction","noisesensedfraction", @@ -229,36 +242,42 @@ PHONE_CONVERSATION: SRC_LANGUAGE: "python" -HEARTRATE: - COMPUTE: False - DB_TABLE: fitbit_data - DAY_SEGMENTS: *day_segments - 
SUMMARY_FEATURES: ["restinghr"] # calories features' accuracy depend on the accuracy of the participants fitbit profile (e.g. heigh, weight) use with care: ["caloriesoutofrange", "caloriesfatburn", "caloriescardio", "caloriespeak"] - INTRADAY_FEATURES: ["maxhr", "minhr", "avghr", "medianhr", "modehr", "stdhr", "diffmaxmodehr", "diffminmodehr", "entropyhr", "minutesonoutofrangezone", "minutesonfatburnzone", "minutesoncardiozone", "minutesonpeakzone"] +FITBIT_HEARTRATE: + TABLE: "fitbit_data" + PARSE_JSON: TRUE + PROVIDERS: + RAPIDS: + COMPUTE: True + SUMMARY_FEATURES: ["restinghr"] # calories features' accuracy depend on the accuracy of the participants fitbit profile (e.g. height, weight) use these with care: ["caloriesoutofrange", "caloriesfatburn", "caloriescardio", "caloriespeak"] + INTRADAY_FEATURES: ["maxhr", "minhr", "avghr", "medianhr", "modehr", "stdhr", "diffmaxmodehr", "diffminmodehr", "entropyhr", "minutesonoutofrangezone", "minutesonfatburnzone", "minutesoncardiozone", "minutesonpeakzone"] -STEP: - COMPUTE: False - DB_TABLE: fitbit_data - DAY_SEGMENTS: *day_segments +FITBIT_STEPS: + TABLE: fitbit_data + PARSE_JSON: TRUE EXCLUDE_SLEEP: EXCLUDE: False - TYPE: FIXED # FIXED OR FITBIT_BASED (CONFIGURE FITBIT's SLEEP DB_TABLE) + TYPE: FIXED # FIXED OR FITBIT_BASED (configure FITBIT_SLEEP section) FIXED: START: "23:00" END: "07:00" - FEATURES: - ALL_STEPS: ["sumallsteps", "maxallsteps", "minallsteps", "avgallsteps", "stdallsteps"] - SEDENTARY_BOUT: ["countepisode", "sumduration", "maxduration", "minduration", "avgduration", "stdduration"] - ACTIVE_BOUT: ["countepisode", "sumduration", "maxduration", "minduration", "avgduration", "stdduration"] - THRESHOLD_ACTIVE_BOUT: 10 # steps - INCLUDE_ZERO_STEP_ROWS: False + PROVIDERS: + RAPIDS: + COMPUTE: TRUE + FEATURES: + ALL_STEPS: ["sumallsteps", "maxallsteps", "minallsteps", "avgallsteps", "stdallsteps"] + SEDENTARY_BOUT: ["countepisode", "sumduration", "maxduration", "minduration", "avgduration", "stdduration"] + 
ACTIVE_BOUT: ["countepisode", "sumduration", "maxduration", "minduration", "avgduration", "stdduration"] + THRESHOLD_ACTIVE_BOUT: 10 # steps + INCLUDE_ZERO_STEP_ROWS: False -SLEEP: - COMPUTE: False - DB_TABLE: fitbit_data - DAY_SEGMENTS: *day_segments - SLEEP_TYPES: ["main", "nap", "all"] - SUMMARY_FEATURES: ["sumdurationafterwakeup", "sumdurationasleep", "sumdurationawake", "sumdurationtofallasleep", "sumdurationinbed", "avgefficiency", "countepisode"] +FITBIT_SLEEP: + TABLE: fitbit_data + PARSE_JSON: TRUE + PROVIDERS: + RAPIDS: + COMPUTE: TRUE + SLEEP_TYPES: ["main", "nap", "all"] + SUMMARY_FEATURES: ["sumdurationafterwakeup", "sumdurationasleep", "sumdurationawake", "sumdurationtofallasleep", "sumdurationinbed", "avgefficiency", "countepisode"] ### Visualizations ################################################################ HEATMAP_FEATURES_CORRELATIONS: diff --git a/rules/features.smk b/rules/features.smk index 672c936f..75219830 100644 --- a/rules/features.smk +++ b/rules/features.smk @@ -6,28 +6,6 @@ rule join_features_from_providers: script: "../src/features/join_features_from_providers.R" -rule resample_episodes: - input: - "data/interim/{pid}/{sensor}_episodes.csv" - output: - "data/interim/{pid}/{sensor}_episodes_resampled.csv" - script: - "../src/features/utils/resample_episodes.R" - -rule resample_episodes_with_datetime: - input: - sensor_input = "data/interim/{pid}/{sensor}_episodes_resampled.csv", - day_segments = "data/interim/day_segments/{pid}_day_segments.csv" - params: - timezones = None, - fixed_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"], - day_segments_type = config["DAY_SEGMENTS"]["TYPE"], - include_past_periodic_segments = config["DAY_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"] - output: - "data/interim/{pid}/{sensor}_episodes_resampled_with_datetime.csv" - script: - "../src/data/readable_datetime.R" - rule phone_accelerometer_python_features: input: sensor_data = "data/raw/{pid}/phone_accelerometer_with_datetime.csv", @@ 
-234,48 +212,48 @@ rule phone_wifi_visible_r_features: script: "../src/features/entry.R" -rule fitbit_heartrate_features: - input: - heartrate_summary_data = "data/raw/{pid}/fitbit_heartrate_summary_with_datetime.csv", - heartrate_intraday_data = "data/raw/{pid}/fitbit_heartrate_intraday_with_datetime.csv" - params: - day_segment = "{day_segment}", - summary_features = config["HEARTRATE"]["SUMMARY_FEATURES"], - intraday_features = config["HEARTRATE"]["INTRADAY_FEATURES"] - output: - "data/processed/{pid}/fitbit_heartrate_{day_segment}.csv" - script: - "../src/features/fitbit_heartrate_features.py" +# rule fitbit_heartrate_features: +# input: +# heartrate_summary_data = "data/raw/{pid}/fitbit_heartrate_summary_with_datetime.csv", +# heartrate_intraday_data = "data/raw/{pid}/fitbit_heartrate_intraday_with_datetime.csv" +# params: +# day_segment = "{day_segment}", +# summary_features = config["HEARTRATE"]["SUMMARY_FEATURES"], +# intraday_features = config["HEARTRATE"]["INTRADAY_FEATURES"] +# output: +# "data/processed/{pid}/fitbit_heartrate_{day_segment}.csv" +# script: +# "../src/features/fitbit_heartrate_features.py" -rule fitbit_step_features: - input: - step_data = "data/raw/{pid}/fitbit_step_intraday_with_datetime.csv", - sleep_data = optional_steps_sleep_input - params: - day_segment = "{day_segment}", - features_all_steps = config["STEP"]["FEATURES"]["ALL_STEPS"], - features_sedentary_bout = config["STEP"]["FEATURES"]["SEDENTARY_BOUT"], - features_active_bout = config["STEP"]["FEATURES"]["ACTIVE_BOUT"], - threshold_active_bout = config["STEP"]["THRESHOLD_ACTIVE_BOUT"], - include_zero_step_rows = config["STEP"]["INCLUDE_ZERO_STEP_ROWS"], - exclude_sleep = config["STEP"]["EXCLUDE_SLEEP"]["EXCLUDE"], - exclude_sleep_type = config["STEP"]["EXCLUDE_SLEEP"]["TYPE"], - exclude_sleep_fixed_start = config["STEP"]["EXCLUDE_SLEEP"]["FIXED"]["START"], - exclude_sleep_fixed_end = config["STEP"]["EXCLUDE_SLEEP"]["FIXED"]["END"], - output: - 
"data/processed/{pid}/fitbit_step_{day_segment}.csv" - script: - "../src/features/fitbit_step_features.py" +# rule fitbit_step_features: +# input: +# step_data = "data/raw/{pid}/fitbit_step_intraday_with_datetime.csv", +# sleep_data = optional_steps_sleep_input +# params: +# day_segment = "{day_segment}", +# features_all_steps = config["STEP"]["FEATURES"]["ALL_STEPS"], +# features_sedentary_bout = config["STEP"]["FEATURES"]["SEDENTARY_BOUT"], +# features_active_bout = config["STEP"]["FEATURES"]["ACTIVE_BOUT"], +# threshold_active_bout = config["STEP"]["THRESHOLD_ACTIVE_BOUT"], +# include_zero_step_rows = config["STEP"]["INCLUDE_ZERO_STEP_ROWS"], +# exclude_sleep = config["STEP"]["EXCLUDE_SLEEP"]["EXCLUDE"], +# exclude_sleep_type = config["STEP"]["EXCLUDE_SLEEP"]["TYPE"], +# exclude_sleep_fixed_start = config["STEP"]["EXCLUDE_SLEEP"]["FIXED"]["START"], +# exclude_sleep_fixed_end = config["STEP"]["EXCLUDE_SLEEP"]["FIXED"]["END"], +# output: +# "data/processed/{pid}/fitbit_step_{day_segment}.csv" +# script: +# "../src/features/fitbit_step_features.py" -rule fitbit_sleep_features: - input: - sleep_summary_data = "data/raw/{pid}/fitbit_sleep_summary_with_datetime.csv", - sleep_intraday_data = "data/raw/{pid}/fitbit_sleep_intraday_with_datetime.csv" - params: - day_segment = "{day_segment}", - summary_features = config["SLEEP"]["SUMMARY_FEATURES"], - sleep_types = config["SLEEP"]["SLEEP_TYPES"] - output: - "data/processed/{pid}/fitbit_sleep_{day_segment}.csv" - script: - "../src/features/fitbit_sleep_features.py" +# rule fitbit_sleep_features: +# input: +# sleep_summary_data = "data/raw/{pid}/fitbit_sleep_summary_with_datetime.csv", +# sleep_intraday_data = "data/raw/{pid}/fitbit_sleep_intraday_with_datetime.csv" +# params: +# day_segment = "{day_segment}", +# summary_features = config["SLEEP"]["SUMMARY_FEATURES"], +# sleep_types = config["SLEEP"]["SLEEP_TYPES"] +# output: +# "data/processed/{pid}/fitbit_sleep_{day_segment}.csv" +# script: +# 
"../src/features/fitbit_sleep_features.py" diff --git a/rules/preprocessing.smk b/rules/preprocessing.smk index 4f332d4e..04e1088d 100644 --- a/rules/preprocessing.smk +++ b/rules/preprocessing.smk @@ -3,7 +3,7 @@ rule restore_sql_file: sql_file = "data/external/rapids_example.sql", db_credentials = ".env" params: - group = config["DOWNLOAD_PARTICIPANTS"]["GROUP"] + group = config["DATABASE_GROUP"] output: touch("data/interim/restore_sql_file.done") script: @@ -15,28 +15,40 @@ rule create_example_participant_files: shell: "echo 'a748ee1a-1d0b-4ae9-9074-279a2b6ba524\nandroid\ntest01\n2020/04/23,2020/05/04\n' >> ./data/external/example01 && echo '13dbc8a3-dae3-4834-823a-4bc96a7d459d\nios\ntest02\n2020/04/23,2020/05/04\n' >> ./data/external/example02" -rule download_participants: - params: - group = config["DOWNLOAD_PARTICIPANTS"]["GROUP"], - ignored_device_ids = config["DOWNLOAD_PARTICIPANTS"]["IGNORED_DEVICE_IDS"], - timezone = config["TIMEZONE"] - priority: 1 - script: - "../src/data/download_participants.R" +# rule download_participants: +# params: +# group = config["DOWNLOAD_PARTICIPANTS"]["GROUP"], +# ignored_device_ids = config["DOWNLOAD_PARTICIPANTS"]["IGNORED_DEVICE_IDS"], +# timezone = config["TIMEZONE"] +# priority: 1 +# script: +# "../src/data/download_participants.R" -rule download_dataset: +rule download_phone_data: input: - "data/external/{pid}" + "data/external/participant_files/{pid}.yaml" params: - group = config["DOWNLOAD_DATASET"]["GROUP"], - sensor = "{sensor}", - table = lambda wildcards: config[str(wildcards.sensor).upper()]["DB_TABLE"], + source = config["SENSOR_DATA"]["PHONE"]["SOURCE"], + sensor = "phone_" + "{sensor}", + table = lambda wildcards: config["PHONE_" + str(wildcards.sensor).upper()]["TABLE"], timezone = config["TIMEZONE"], - aware_multiplatform_tables = config["PHONE_ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"] + "," + config["PHONE_ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"] + "," + 
config["PHONE_CONVERSATION"]["DB_TABLE"]["ANDROID"] + "," + config["PHONE_CONVERSATION"]["DB_TABLE"]["IOS"], + aware_multiplatform_tables = config["PHONE_ACTIVITY_RECOGNITION"]["TABLE"]["ANDROID"] + "," + config["PHONE_ACTIVITY_RECOGNITION"]["TABLE"]["IOS"] + "," + config["PHONE_CONVERSATION"]["TABLE"]["ANDROID"] + "," + config["PHONE_CONVERSATION"]["TABLE"]["IOS"], output: - "data/raw/{pid}/{sensor}_raw.csv" + "data/raw/{pid}/phone_{sensor}_raw.csv" script: - "../src/data/download_dataset.R" + "../src/data/download_phone_data.R" + +rule download_fitbit_data: + input: + "data/external/participant_files/{pid}.yaml" + params: + source = config["SENSOR_DATA"]["FITBIT"]["SOURCE"], + sensor = "fitbit_" + "{sensor}", + table = lambda wildcards: config["FITBIT_" + str(wildcards.sensor).upper()]["TABLE"], + output: + "data/raw/{pid}/fitbit_{sensor}_raw.csv" + script: + "../src/data/download_fitbit_data.R" rule compute_day_segments: input: @@ -55,8 +67,8 @@ rule phone_readable_datetime: sensor_input = "data/raw/{pid}/phone_{sensor}_raw.csv", day_segments = "data/interim/day_segments/{pid}_day_segments.csv" params: - timezones = None, - fixed_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"], + timezones = config["SENSOR_DATA"]["PHONE"]["TIMEZONE"]["TYPE"], + fixed_timezone = config["SENSOR_DATA"]["PHONE"]["TIMEZONE"]["VALUE"], day_segments_type = config["DAY_SEGMENTS"]["TYPE"], include_past_periodic_segments = config["DAY_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"] output: @@ -97,7 +109,7 @@ rule phone_valid_sensed_days: rule unify_ios_android: input: sensor_data = "data/raw/{pid}/{sensor}_with_datetime.csv", - participant_info = "data/external/{pid}" + participant_info = "data/external/participant_files/{pid}.yaml" params: sensor = "{sensor}", output: @@ -105,7 +117,7 @@ rule unify_ios_android: script: "../src/data/unify_ios_android.R" -rule process_phone_location_types: +rule process_phone_locations_types: input: locations = 
"data/raw/{pid}/phone_locations_raw.csv", phone_sensed_timestamps = "data/interim/{pid}/phone_sensed_timestamps.csv", @@ -118,13 +130,13 @@ rule process_phone_location_types: script: "../src/data/process_location_types.R" -rule readable_datetime_location_processed: +rule phone_locations_processed_with_datetime: input: sensor_input = "data/interim/{pid}/phone_locations_processed.csv", day_segments = "data/interim/day_segments/{pid}_day_segments.csv" params: - timezones = None, - fixed_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"], + timezones = config["SENSOR_DATA"]["PHONE"]["TIMEZONE"]["TYPE"], + fixed_timezone = config["SENSOR_DATA"]["PHONE"]["TIMEZONE"]["VALUE"], day_segments_type = config["DAY_SEGMENTS"]["TYPE"], include_past_periodic_segments = config["DAY_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"] output: @@ -132,6 +144,28 @@ rule readable_datetime_location_processed: script: "../src/data/readable_datetime.R" +rule resample_episodes: + input: + "data/interim/{pid}/{sensor}_episodes.csv" + output: + "data/interim/{pid}/{sensor}_episodes_resampled.csv" + script: + "../src/features/utils/resample_episodes.R" + +rule resample_episodes_with_datetime: + input: + sensor_input = "data/interim/{pid}/{sensor}_episodes_resampled.csv", + day_segments = "data/interim/day_segments/{pid}_day_segments.csv" + params: + timezones = config["SENSOR_DATA"]["PHONE"]["TIMEZONE"]["TYPE"], + fixed_timezone = config["SENSOR_DATA"]["PHONE"]["TIMEZONE"]["VALUE"], + day_segments_type = config["DAY_SEGMENTS"]["TYPE"], + include_past_periodic_segments = config["DAY_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"] + output: + "data/interim/{pid}/{sensor}_episodes_resampled_with_datetime.csv" + script: + "../src/data/readable_datetime.R" + rule phone_application_categories: input: "data/raw/{pid}/phone_applications_foreground_with_datetime.csv" @@ -145,37 +179,37 @@ rule phone_application_categories: script: "../src/data/application_categories.R" -rule fitbit_heartrate_with_datetime: 
- input: - expand("data/raw/{{pid}}/{fitbit_table}_raw.csv", fitbit_table=config["HEARTRATE"]["DB_TABLE"]) - params: - local_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"], - fitbit_sensor = "heartrate" - output: - summary_data = "data/raw/{pid}/fitbit_heartrate_summary_with_datetime.csv", - intraday_data = "data/raw/{pid}/fitbit_heartrate_intraday_with_datetime.csv" - script: - "../src/data/fitbit_readable_datetime.py" +# rule fitbit_heartrate_with_datetime: +# input: +# expand("data/raw/{{pid}}/{fitbit_table}_raw.csv", fitbit_table=config["HEARTRATE"]["TABLE"]) +# params: +# local_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"], +# fitbit_sensor = "heartrate" +# output: +# summary_data = "data/raw/{pid}/fitbit_heartrate_summary_with_datetime.csv", +# intraday_data = "data/raw/{pid}/fitbit_heartrate_intraday_with_datetime.csv" +# script: +# "../src/data/fitbit_readable_datetime.py" -rule fitbit_step_with_datetime: - input: - expand("data/raw/{{pid}}/{fitbit_table}_raw.csv", fitbit_table=config["STEP"]["DB_TABLE"]) - params: - local_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"], - fitbit_sensor = "steps" - output: - intraday_data = "data/raw/{pid}/fitbit_step_intraday_with_datetime.csv" - script: - "../src/data/fitbit_readable_datetime.py" +# rule fitbit_step_with_datetime: +# input: +# expand("data/raw/{{pid}}/{fitbit_table}_raw.csv", fitbit_table=config["STEP"]["TABLE"]) +# params: +# local_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"], +# fitbit_sensor = "steps" +# output: +# intraday_data = "data/raw/{pid}/fitbit_step_intraday_with_datetime.csv" +# script: +# "../src/data/fitbit_readable_datetime.py" -rule fitbit_sleep_with_datetime: - input: - expand("data/raw/{{pid}}/{fitbit_table}_raw.csv", fitbit_table=config["SLEEP"]["DB_TABLE"]) - params: - local_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"], - fitbit_sensor = "sleep" - output: - summary_data = "data/raw/{pid}/fitbit_sleep_summary_with_datetime.csv", - 
intraday_data = "data/raw/{pid}/fitbit_sleep_intraday_with_datetime.csv" - script: - "../src/data/fitbit_readable_datetime.py" +# rule fitbit_sleep_with_datetime: +# input: +# expand("data/raw/{{pid}}/{fitbit_table}_raw.csv", fitbit_table=config["SLEEP"]["TABLE"]) +# params: +# local_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"], +# fitbit_sensor = "sleep" +# output: +# summary_data = "data/raw/{pid}/fitbit_sleep_summary_with_datetime.csv", +# intraday_data = "data/raw/{pid}/fitbit_sleep_intraday_with_datetime.csv" +# script: +# "../src/data/fitbit_readable_datetime.py" diff --git a/rules/reports.smk b/rules/reports.smk index 13064a02..0a9d74ec 100644 --- a/rules/reports.smk +++ b/rules/reports.smk @@ -66,7 +66,7 @@ rule overall_compliance_heatmap: pid_files = expand("data/external/{pid}", pid=config["PIDS"]) params: only_show_valid_days = config["OVERALL_COMPLIANCE_HEATMAP"]["ONLY_SHOW_VALID_DAYS"], - local_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"], + local_timezone = config["SENSOR_DATA"]["PHONE"]["TIMEZONE"]["VALUE"], expected_num_of_days = config["OVERALL_COMPLIANCE_HEATMAP"]["EXPECTED_NUM_OF_DAYS"], bin_size = config["OVERALL_COMPLIANCE_HEATMAP"]["BIN_SIZE"], min_bins_per_hour = "{min_valid_bins_per_hour}" diff --git a/src/data/download_fitbit_data.R b/src/data/download_fitbit_data.R new file mode 100644 index 00000000..19f01674 --- /dev/null +++ b/src/data/download_fitbit_data.R @@ -0,0 +1,40 @@ +source("renv/activate.R") +library(RMySQL) +library(dplyr) +library(readr) +library(stringr) +library(yaml) + + +participant_file <- snakemake@input[[1]] +source <- snakemake@params[["source"]] +sensor <- snakemake@params[["sensor"]] +table <- snakemake@params[["table"]] +sensor_file <- snakemake@output[[1]] + +participant <- read_yaml(participant_file) +if(! 
"FITBIT" %in% names(participant)){ + stop(paste("The following participant file does not have a FITBIT section, create one manually or automatically (see the docs):", participant_file)) +} +device_ids <- participant$FITBIT$DEVICE_IDS +unified_device_id <- tail(device_ids, 1) +# As opposed to phone data, we dont' filter by date here because data can still be in JSON format, we need to parse it first + +if(source$TYPE == "DATABASE"){ + dbEngine <- dbConnect(MySQL(), default.file = "./.env", group = source$DATABASE_GROUP) + query <- paste0("SELECT * FROM ", table, " WHERE ",source$DEVICE_ID_COLUMN," IN ('", paste0(device_ids, collapse = "','"), "')") + sensor_data <- dbGetQuery(dbEngine, query) + dbDisconnect(dbEngine) + sensor_data <- sensor_data %>% + rename(device_id = source$DEVICE_ID_COLUMN) %>% + mutate(device_id = unified_device_id) # Unify device_id + + if(FALSE) # For MoSHI use, we didn't split fitbit sensors into different tables + sensor_data <- sensor_data %>% filter(fitbit_data_type == str_split(sensor, "_", simplify = TRUE)[[2]]) + + # Droping duplicates on all columns except for _id or id + sensor_data <- sensor_data %>% distinct(!!!syms(setdiff(names(sensor_data), c("_id", "id")))) + + write_csv(sensor_data, sensor_file) + +} diff --git a/src/data/download_dataset.R b/src/data/download_phone_data.R similarity index 58% rename from src/data/download_dataset.R rename to src/data/download_phone_data.R index 4876c5cf..d6c9dd10 100644 --- a/src/data/download_dataset.R +++ b/src/data/download_phone_data.R @@ -4,6 +4,9 @@ library(RMySQL) library(stringr) library(dplyr) library(readr) +library(yaml) +library(lubridate) +options(scipen=999) validate_deviceid_platforms <- function(device_ids, platforms){ if(length(device_ids) == 1){ @@ -37,38 +40,57 @@ is_multiplaform_participant <- function(dbEngine, device_ids, platforms){ return(FALSE) } -participant <- snakemake@input[[1]] -group <- snakemake@params[["group"]] +get_timestamp_filter <- function(device_ids, 
participant, timezone){ + # Build the SQL clause that keeps rows between PHONE[START_DATE] 00:00:00 and PHONE[END_DATE] 23:59:59 (read in `timezone`, converted to epoch milliseconds); returns "" when either date is missing or fails to parse + start_date <- ymd_hms(paste(participant$PHONE$START_DATE,"00:00:00"), tz=timezone, quiet=TRUE) + end_date <- ymd_hms(paste(participant$PHONE$END_DATE, "23:59:59"), tz=timezone, quiet=TRUE) + start_timestamp = as.numeric(start_date) * 1000 + end_timestamp = as.numeric(end_date) * 1000 + if(is.na(start_timestamp)){ + message(paste("PHONE[START_DATE] was not provided or failed to parse (", participant$PHONE$START_DATE,"), all data for", paste0(device_ids, collapse=","),"is returned")) + return("") + }else if(is.na(end_timestamp)){ + message(paste("PHONE[END_DATE] was not provided or failed to parse (", participant$PHONE$END_DATE,"), all data for", paste0(device_ids, collapse=","),"is returned")) + return("") + } else if(start_timestamp > end_timestamp){ + stop(paste("PHONE[START_DATE] has to be before PHONE[END_DATE] (", start_date, ",", end_date, ") for", paste0(device_ids, collapse=","))) + # stop() aborts the script here, so an inverted range never falls through to an unfiltered download + } else { + message(paste("Filtering data between", start_date, "and", end_date, "in", timezone, "for",paste0(device_ids, collapse=","))) + return(paste0(" AND timestamp BETWEEN ", start_timestamp, " AND ", end_timestamp)) + } +} + +participant_file <- snakemake@input[[1]] +source <- snakemake@params[["source"]] +group <- source$DATABASE_GROUP table <- snakemake@params[["table"]] sensor <- snakemake@params[["sensor"]] timezone <- snakemake@params[["timezone"]] aware_multiplatform_tables <- str_split(snakemake@params[["aware_multiplatform_tables"]], ",")[[1]] sensor_file <- snakemake@output[[1]] -device_ids <- strsplit(readLines(participant, n=1), ",")[[1]] +participant <- read_yaml(participant_file) +if(! 
"PHONE" %in% names(participant)){ + stop(paste("The following participant file does not have a PHONE section, create one manually or automatically (see the docs):", participant_file)) +} +device_ids <- participant$PHONE$DEVICE_IDS unified_device_id <- tail(device_ids, 1) -platforms <- strsplit(readLines(participant, n=2)[[2]], ",")[[1]] +platforms <- participant$PHONE$PLATFORMS validate_deviceid_platforms(device_ids, platforms) - -# Read start and end date from the participant file to filter data within that range -start_date <- strsplit(readLines(participant, n=4)[4], ",")[[1]][1] -end_date <- strsplit(readLines(participant, n=4)[4], ",")[[1]][2] -start_datetime_utc = format(as.POSIXct(paste0(start_date, " 00:00:00"),format="%Y/%m/%d %H:%M:%S",origin="1970-01-01",tz=timezone), tz="UTC") -end_datetime_utc = format(as.POSIXct(paste0(end_date, " 23:59:59"),format="%Y/%m/%d %H:%M:%S",origin="1970-01-01",tz=timezone), tz="UTC") +timestamp_filter <- get_timestamp_filter(device_ids, participant, timezone) dbEngine <- dbConnect(MySQL(), default.file = "./.env", group = group) if(is_multiplaform_participant(dbEngine, device_ids, platforms)){ - sensor_data <- unify_raw_data(dbEngine, table, sensor, start_datetime_utc, end_datetime_utc, aware_multiplatform_tables, device_ids, platforms) + sensor_data <- unify_raw_data(dbEngine, table, sensor, timestamp_filter, aware_multiplatform_tables, device_ids, platforms) }else { # table has two elements for conversation and activity recognition (they store data on a different table for ios and android) - if(length(table) > 1){ + if(length(table) > 1) table <- table[[toupper(platforms[1])]] - } - query <- paste0("SELECT * FROM ", table, " WHERE device_id IN ('", paste0(device_ids, collapse = "','"), "')") - if(!(is.na(start_datetime_utc)) && !(is.na(end_datetime_utc)) && start_datetime_utc < end_datetime_utc) - query <- paste0(query, "AND timestamp BETWEEN 1000*UNIX_TIMESTAMP('", start_datetime_utc, "') AND 1000*UNIX_TIMESTAMP('", 
end_datetime_utc, "')") - sensor_data <- dbGetQuery(dbEngine, query) + query <- paste0("SELECT * FROM ", table, " WHERE ",source$DEVICE_ID_COLUMN," IN ('", paste0(device_ids, collapse = "','"), "')", timestamp_filter) + sensor_data <- dbGetQuery(dbEngine, query) %>% + rename(device_id = source$DEVICE_ID_COLUMN) } sensor_data <- sensor_data %>% arrange(timestamp) diff --git a/src/data/unify_ios_android.R b/src/data/unify_ios_android.R index bc93bada..48ed3efc 100644 --- a/src/data/unify_ios_android.R +++ b/src/data/unify_ios_android.R @@ -1,11 +1,13 @@ source("renv/activate.R") source("src/data/unify_utils.R") +library(yaml) sensor_data <- read.csv(snakemake@input[["sensor_data"]], stringsAsFactors = FALSE) participant_info <- snakemake@input[["participant_info"]] sensor <- snakemake@params[["sensor"]] -platforms <- strsplit(readLines(participant_info, n=2)[[2]], ",")[[1]] +participant <- read_yaml(participant_info) +platforms <- participant$PHONE$PLATFORMS platform <- ifelse(platforms[1] == "multiple" | (length(platforms) > 1 & "android" %in% platforms & "ios" %in% platforms), "android", platforms[1]) sensor_data <- unify_data(sensor_data, sensor, platform) diff --git a/src/data/unify_utils.R b/src/data/unify_utils.R index 1437b62f..d82ed2ae 100644 --- a/src/data/unify_utils.R +++ b/src/data/unify_utils.R @@ -138,7 +138,7 @@ unify_ios_conversation <- function(conversation){ } # This function is used in download_dataset.R -unify_raw_data <- function(dbEngine, sensor_table, sensor, start_datetime_utc, end_datetime_utc, aware_multiplatform_tables, device_ids, platforms){ +unify_raw_data <- function(dbEngine, sensor_table, sensor, timestamp_filter, aware_multiplatform_tables, device_ids, platforms){ # If platforms is 'multiple', fetch each device_id's platform from aware_device, otherwise, use those given by the user if(length(platforms) == 1 && platforms == "multiple") devices_platforms <- dbGetQuery(dbEngine, paste0("SELECT device_id,brand FROM aware_device WHERE 
device_id IN ('", paste0(device_ids, collapse = "','"), "')")) %>% @@ -169,10 +169,7 @@ unify_raw_data <- function(dbEngine, sensor_table, sensor, start_datetime_utc, e table <- conversation_tables[[platform]] if(table %in% available_tables_in_db){ - query <- paste0("SELECT * FROM ", table, " WHERE device_id IN ('", device_id, "')") - if(!(is.na(start_datetime_utc)) && !(is.na(end_datetime_utc)) && start_datetime_utc < end_datetime_utc){ - query <- paste0(query, "AND timestamp BETWEEN 1000*UNIX_TIMESTAMP('", start_datetime_utc, "') AND 1000*UNIX_TIMESTAMP('", end_datetime_utc, "')") - } + query <- paste0("SELECT * FROM ", table, " WHERE device_id IN ('", device_id, "')", timestamp_filter) sensor_data <- unify_data(dbGetQuery(dbEngine, query), sensor, platform) participants_sensordata <- append(participants_sensordata, list(sensor_data)) }else{ diff --git a/tools/update_format_participant_files.py b/tools/update_format_participant_files.py new file mode 100644 index 00000000..3c6a0934 --- /dev/null +++ b/tools/update_format_participant_files.py @@ -0,0 +1,37 @@ +#!/usr/bin/python + +from pathlib import Path +import yaml, os +import sys +p = Path(r'data/external/').glob('*') +files = [x for x in p if x.is_file() and x.suffix == "" and "." 
not in x.stem] +for file in files: + reader = open(file, 'r') + phone = {"DEVICES_IDS" :"", "PLATFORMS" :"", "LABEL" :"", "START_DATE" :"", "END_DATE" :""} + lines = reader.read().splitlines() + if(len(lines) >=1 and len(lines[0]) > 0): + phone["DEVICE_IDS"] = lines[0] + if(len(lines) >=2 and len(lines[1]) > 0): + phone["PLATFORMS"] = lines[1] + if(len(lines) >=3 and len(lines[2]) > 0): + phone["LABEL"] = lines[2] + if(len(lines) >=4 and len(lines[3]) > 0): + phone["START_DATE"] = lines[3].split(",")[0] + phone["END_DATE"] = lines[3].split(",")[1] + new_participant_file = Path(r'data/external/participant_files/') / (file.stem + ".yaml") + os.makedirs(os.path.dirname(new_participant_file), exist_ok=True) + with open(new_participant_file, 'w') as writer: + writer.write("PHONE:\n") + writer.write(" DEVICE_IDS: [{}]\n".format(phone["DEVICE_IDS"])) + writer.write(" PLATFORMS: [{}]\n".format(phone["PLATFORMS"])) + writer.write(" LABEL: {}\n".format(phone["LABEL"])) + writer.write(" START_DATE: {}\n".format(phone["START_DATE"])) + writer.write(" END_DATE: {}\n".format(phone["END_DATE"])) + + writer.write("FITBIT:\n") + writer.write(" DEVICE_IDS: [{}]\n".format(phone["DEVICE_IDS"])) + writer.write(" LABEL: {}\n".format(phone["LABEL"])) + writer.write(" START_DATE: {}\n".format(phone["START_DATE"])) + writer.write(" END_DATE: {}\n".format(phone["END_DATE"])) +print("Processed files:") +print(list(map(str, files))) \ No newline at end of file