# rapids/rules/preprocessing.smk

# Runs ../src/data/restore_sql_file.py with the example SQL dump and the .env
# file as inputs. The only output is a flag file created through touch(), so
# it records that the script finished rather than holding any data.
rule restore_sql_file:
    input:
        sql_file = "data/external/rapids_example.sql",
        db_credentials = ".env"
    params:
        group = config["DATABASE_GROUP"]
    output:
        touch("data/interim/restore_sql_file.done")
    script:
        "../src/data/restore_sql_file.py"

# Writes the two example participant files (example01, example02) with echo.
# Each echo appends (>>) a PHONE + FITBIT YAML snippet to a hard-coded path
# that mirrors the corresponding entry of the expand() output list.
# The command is split into two adjacent string literals (one per file);
# they are concatenated into a single shell command.
rule create_example_participant_files:
    output:
        expand("data/external/participant_files/{pid}.yaml", pid = ["example01", "example02"])
    shell:
        "echo 'PHONE:\n DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524]\n PLATFORMS: [android]\n LABEL: test-01\n START_DATE: 2020-04-23\n END_DATE: 2020-05-04\nFITBIT:\n DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524]\n LABEL: test-01\n START_DATE: 2020-04-23\n END_DATE: 2020-05-04\n' >> ./data/external/participant_files/example01.yaml && "
        "echo 'PHONE:\n DEVICE_IDS: [13dbc8a3-dae3-4834-823a-4bc96a7d459d]\n PLATFORMS: [ios]\n LABEL: test-02\n START_DATE: 2020-04-23\n END_DATE: 2020-05-04\nFITBIT:\n DEVICE_IDS: [13dbc8a3-dae3-4834-823a-4bc96a7d459d]\n LABEL: test-02\n START_DATE: 2020-04-23\n END_DATE: 2020-05-04\n' >> ./data/external/participant_files/example02.yaml"

# Runs create_participants_files.R with the CREATE_PARTICIPANT_FILES config
# section as its only parameter. The rule declares no output.
# The CSV path is an input only when SOURCE.TYPE is not "AWARE_DEVICE_TABLE";
# otherwise the input list is empty.
rule create_participants_files:
    input:
        participants_file = (
            []
            if config["CREATE_PARTICIPANT_FILES"]["SOURCE"]["TYPE"] == "AWARE_DEVICE_TABLE"
            else config["CREATE_PARTICIPANT_FILES"]["SOURCE"]["CSV_FILE_PATH"]
        )
    params:
        config = config["CREATE_PARTICIPANT_FILES"]
    script:
        "../src/data/create_participants_files.R"

# Produces data/raw/{pid}/phone_{sensor}_raw.csv by running pull_phone_data.R.
# Inputs are supplied by pull_phone_data_input_with_mutation_scripts, which is
# defined outside this file section and unpacked into named inputs.
rule pull_phone_data:
    input:
        unpack(pull_phone_data_input_with_mutation_scripts)
    params:
        # Stream configuration selected by PHONE_DATA_STREAMS.USE.
        data_configuration = config["PHONE_DATA_STREAMS"][config["PHONE_DATA_STREAMS"]["USE"]],
        sensor = "phone_{sensor}",
        # Looked up per job from the PHONE_<SENSOR> config section.
        tables = lambda wildcards: config["PHONE_" + str(wildcards.sensor).upper()]["TABLE"],
    output:
        "data/raw/{pid}/phone_{sensor}_raw.csv"
    script:
        "../src/data/streams/pull_phone_data.R"

# Produces data/raw/{pid}/fitbit_{sensor}_raw.csv by running download_fitbit_data.R.
# When the Fitbit source type is "DATABASE" there is no input file; otherwise
# the TABLE entry of the FITBIT_<SENSOR> config section is used as the input path.
rule download_fitbit_data:
    input:
        participant_file = "data/external/participant_files/{pid}.yaml",
        input_file = (
            []
            if config["FITBIT_DATA_CONFIGURATION"]["SOURCE"]["TYPE"] == "DATABASE"
            else (lambda wildcards: config["FITBIT_" + str(wildcards.sensor).upper()]["TABLE"])
        )
    params:
        data_configuration = config["FITBIT_DATA_CONFIGURATION"],
        sensor = "fitbit_{sensor}",
        table = lambda wildcards: config["FITBIT_" + str(wildcards.sensor).upper()]["TABLE"],
    output:
        "data/raw/{pid}/fitbit_{sensor}_raw.csv"
    script:
        "../src/data/download_fitbit_data.R"

# Runs compute_time_segments.py for one participant.
# Inputs are positional: the configured time segments file first, then the
# participant's YAML file. Two CSVs are written: the segments and their labels.
rule compute_time_segments:
    input:
        config["TIME_SEGMENTS"]["FILE"],
        "data/external/participant_files/{pid}.yaml"
    params:
        time_segments_type = config["TIME_SEGMENTS"]["TYPE"],
        pid = "{pid}"
    output:
        segments_file = "data/interim/time_segments/{pid}_time_segments.csv",
        segments_labels_file = "data/interim/time_segments/{pid}_time_segments_labels.csv",
    script:
        "../src/data/compute_time_segments.py"

# Runs the shared readable_datetime.R script on a raw phone sensor file and
# writes data/raw/{pid}/phone_{sensor}_with_datetime.csv.
# tzcodes_file is resolved by input_tzcodes_file, defined outside this section.
rule phone_readable_datetime:
    input:
        sensor_input = "data/raw/{pid}/phone_{sensor}_raw.csv",
        time_segments = "data/interim/time_segments/{pid}_time_segments.csv",
        pid_file = "data/external/participant_files/{pid}.yaml",
        tzcodes_file = input_tzcodes_file,
    params:
        device_type = "phone",
        timezone_parameters = config["TIMEZONE"],
        pid = "{pid}",
        time_segments_type = config["TIME_SEGMENTS"]["TYPE"],
        include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
    output:
        "data/raw/{pid}/phone_{sensor}_with_datetime.csv"
    script:
        "../src/data/datetime/readable_datetime.R"

# Runs phone_yielded_timestamps.R over the raw files of every sensor listed in
# PHONE_DATA_YIELD.SENSORS (names are lower-cased to build the file paths).
# {{pid}} is escaped so expand() leaves the pid wildcard in place.
rule phone_yielded_timestamps:
    input:
        all_sensors = expand(
            "data/raw/{{pid}}/{sensor}_raw.csv",
            sensor = [name.lower() for name in config["PHONE_DATA_YIELD"]["SENSORS"]],
        )
    params:
        sensors = config["PHONE_DATA_YIELD"]["SENSORS"] # not used but needed so the rule is triggered if this array changes
    output:
        "data/interim/{pid}/phone_yielded_timestamps.csv"
    script:
        "../src/data/phone_yielded_timestamps.R"

# Runs the shared readable_datetime.R script on the yielded-timestamps file
# and writes data/interim/{pid}/phone_yielded_timestamps_with_datetime.csv.
# tzcodes_file is resolved by input_tzcodes_file, defined outside this section.
rule phone_yielded_timestamps_with_datetime:
    input:
        sensor_input = "data/interim/{pid}/phone_yielded_timestamps.csv",
        time_segments = "data/interim/time_segments/{pid}_time_segments.csv",
        pid_file = "data/external/participant_files/{pid}.yaml",
        tzcodes_file = input_tzcodes_file,
    params:
        device_type = "phone",
        timezone_parameters = config["TIMEZONE"],
        pid = "{pid}",
        time_segments_type = config["TIME_SEGMENTS"]["TYPE"],
        include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
    output:
        "data/interim/{pid}/phone_yielded_timestamps_with_datetime.csv"
    script:
        "../src/data/datetime/readable_datetime.R"

# Runs unify_ios_android.R on a sensor's *_with_datetime.csv file together
# with the participant's YAML file, writing *_with_datetime_unified.csv.
rule unify_ios_android:
    input:
        sensor_data = "data/raw/{pid}/{sensor}_with_datetime.csv",
        participant_info = "data/external/participant_files/{pid}.yaml"
    params:
        sensor = "{sensor}",
    output:
        "data/raw/{pid}/{sensor}_with_datetime_unified.csv"
    script:
        "../src/data/unify_ios_android.R"

# Runs process_location_types.R on the raw phone locations and the yielded
# timestamps of a participant, writing phone_locations_processed.csv.
# All three parameters are read from the PHONE_LOCATIONS config section.
rule process_phone_locations_types:
    input:
        locations = "data/raw/{pid}/phone_locations_raw.csv",
        phone_sensed_timestamps = "data/interim/{pid}/phone_yielded_timestamps.csv",
    params:
        consecutive_threshold = config["PHONE_LOCATIONS"]["FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD"],
        time_since_valid_location = config["PHONE_LOCATIONS"]["FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION"],
        locations_to_use = config["PHONE_LOCATIONS"]["LOCATIONS_TO_USE"]
    output:
        "data/interim/{pid}/phone_locations_processed.csv"
    script:
        "../src/data/process_location_types.R"

# Runs the shared readable_datetime.R script on the processed locations file
# and writes data/interim/{pid}/phone_locations_processed_with_datetime.csv.
# tzcodes_file is resolved by input_tzcodes_file, defined outside this section.
rule phone_locations_processed_with_datetime:
    input:
        sensor_input = "data/interim/{pid}/phone_locations_processed.csv",
        time_segments = "data/interim/time_segments/{pid}_time_segments.csv",
        pid_file = "data/external/participant_files/{pid}.yaml",
        tzcodes_file = input_tzcodes_file,
    params:
        device_type = "phone",
        timezone_parameters = config["TIMEZONE"],
        pid = "{pid}",
        time_segments_type = config["TIME_SEGMENTS"]["TYPE"],
        include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
    output:
        "data/interim/{pid}/phone_locations_processed_with_datetime.csv"
    script:
        "../src/data/datetime/readable_datetime.R"

# Runs infer_home_location.py on the processed, datetime-annotated locations.
# Every parameter comes from PHONE_LOCATIONS.HOME_INFERENCE in the config.
rule phone_locations_processed_with_datetime_with_home:
    input:
        sensor_input = "data/interim/{pid}/phone_locations_processed_with_datetime.csv"
    params:
        dbscan_eps = config["PHONE_LOCATIONS"]["HOME_INFERENCE"]["DBSCAN_EPS"],
        dbscan_minsamples = config["PHONE_LOCATIONS"]["HOME_INFERENCE"]["DBSCAN_MINSAMPLES"],
        threshold_static = config["PHONE_LOCATIONS"]["HOME_INFERENCE"]["THRESHOLD_STATIC"],
        clustering_algorithm = config["PHONE_LOCATIONS"]["HOME_INFERENCE"]["CLUSTERING_ALGORITHM"]
    output:
        "data/interim/{pid}/phone_locations_processed_with_datetime_with_home.csv"
    script:
        "../src/data/infer_home_location.py"

# Runs resample_episodes.R on any {sensor}_episodes.csv file of a participant,
# writing {sensor}_episodes_resampled.csv next to it.
rule resample_episodes:
    input:
        "data/interim/{pid}/{sensor}_episodes.csv"
    output:
        "data/interim/{pid}/{sensor}_episodes_resampled.csv"
    script:
        "../src/features/utils/resample_episodes.R"

# Runs the shared readable_datetime.R script on a resampled episodes file.
# device_type is the part of the sensor wildcard before its first underscore
# (the whole wildcard when it contains no underscore).
# tzcodes_file is resolved by input_tzcodes_file, defined outside this section.
rule resample_episodes_with_datetime:
    input:
        sensor_input = "data/interim/{pid}/{sensor}_episodes_resampled.csv",
        time_segments = "data/interim/time_segments/{pid}_time_segments.csv",
        pid_file = "data/external/participant_files/{pid}.yaml",
        tzcodes_file = input_tzcodes_file,
    params:
        device_type = lambda wildcards: wildcards.sensor.partition("_")[0],
        timezone_parameters = config["TIMEZONE"],
        pid = "{pid}",
        time_segments_type = config["TIME_SEGMENTS"]["TYPE"],
        include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
    output:
        "data/interim/{pid}/{sensor}_episodes_resampled_with_datetime.csv"
    script:
        "../src/data/datetime/readable_datetime.R"

def _application_categories_setting(key):
    """Return a params function that reads `key` from the APPLICATION_CATEGORIES
    block of the PHONE_APPLICATIONS_<TYPE> config section, where <TYPE> is the
    upper-cased `type` wildcard of the job being resolved."""
    def lookup(wildcards):
        section = config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]
        return section["APPLICATION_CATEGORIES"][key]
    return lookup

# Runs application_categories.R on a phone_applications_{type} file that already
# has datetime columns, writing the *_with_categories.csv variant.
# All four parameters are resolved per job from the config section matching {type}.
rule phone_application_categories:
    input:
        "data/raw/{pid}/phone_applications_{type}_with_datetime.csv"
    params:
        catalogue_source = _application_categories_setting("CATALOGUE_SOURCE"),
        catalogue_file = _application_categories_setting("CATALOGUE_FILE"),
        update_catalogue_file = _application_categories_setting("UPDATE_CATALOGUE_FILE"),
        # The param is named "genres" but reads the SCRAPE_MISSING_CATEGORIES key.
        scrape_missing_genres = _application_categories_setting("SCRAPE_MISSING_CATEGORIES")
    output:
        "data/raw/{pid}/phone_applications_{type}_with_datetime_with_categories.csv"
    script:
        "../src/data/application_categories.R"

# Runs fitbit_parse_heartrate.py on a raw Fitbit heart-rate file and writes
# fitbit_heartrate_{fitbit_data_type}_parsed.csv.
# `table` is looked up per job from FITBIT_HEARTRATE_<FITBIT_DATA_TYPE>.
rule fitbit_parse_heartrate:
    input:
        participant_file = "data/external/participant_files/{pid}.yaml",
        raw_data = "data/raw/{pid}/fitbit_heartrate_{fitbit_data_type}_raw.csv"
    params:
        timezone = config["FITBIT_DATA_CONFIGURATION"]["TIMEZONE"]["VALUE"],
        table = lambda wildcards: config["FITBIT_HEARTRATE_" + str(wildcards.fitbit_data_type).upper()]["TABLE"],
        column_format = config["FITBIT_DATA_CONFIGURATION"]["SOURCE"]["COLUMN_FORMAT"],
        fitbit_data_type = "{fitbit_data_type}"
    output:
        "data/raw/{pid}/fitbit_heartrate_{fitbit_data_type}_parsed.csv"
    script:
        "../src/data/fitbit_parse_heartrate.py"

# Runs fitbit_parse_steps.py on a raw Fitbit steps file and writes
# fitbit_steps_{fitbit_data_type}_parsed.csv.
# `table` is looked up per job from FITBIT_STEPS_<FITBIT_DATA_TYPE>.
rule fitbit_parse_steps:
    input:
        participant_file = "data/external/participant_files/{pid}.yaml",
        raw_data = "data/raw/{pid}/fitbit_steps_{fitbit_data_type}_raw.csv"
    params:
        timezone = config["FITBIT_DATA_CONFIGURATION"]["TIMEZONE"]["VALUE"],
        table = lambda wildcards: config["FITBIT_STEPS_" + str(wildcards.fitbit_data_type).upper()]["TABLE"],
        column_format = config["FITBIT_DATA_CONFIGURATION"]["SOURCE"]["COLUMN_FORMAT"],
        fitbit_data_type = "{fitbit_data_type}"
    output:
        "data/raw/{pid}/fitbit_steps_{fitbit_data_type}_parsed.csv"
    script:
        "../src/data/fitbit_parse_steps.py"

# Runs fitbit_parse_sleep.py on a raw Fitbit sleep file and writes
# fitbit_sleep_{fitbit_data_type}_parsed.csv.
# `table` is looked up per job from FITBIT_SLEEP_<FITBIT_DATA_TYPE>;
# sleep_episode_timestamp always comes from the FITBIT_SLEEP_SUMMARY section.
rule fitbit_parse_sleep:
    input:
        participant_file = "data/external/participant_files/{pid}.yaml",
        raw_data = "data/raw/{pid}/fitbit_sleep_{fitbit_data_type}_raw.csv"
    params:
        timezone = config["FITBIT_DATA_CONFIGURATION"]["TIMEZONE"]["VALUE"],
        table = lambda wildcards: config["FITBIT_SLEEP_" + str(wildcards.fitbit_data_type).upper()]["TABLE"],
        column_format = config["FITBIT_DATA_CONFIGURATION"]["SOURCE"]["COLUMN_FORMAT"],
        fitbit_data_type = "{fitbit_data_type}",
        sleep_episode_timestamp = config["FITBIT_SLEEP_SUMMARY"]["SLEEP_EPISODE_TIMESTAMP"]
    output:
        "data/raw/{pid}/fitbit_sleep_{fitbit_data_type}_parsed.csv"
    script:
        "../src/data/fitbit_parse_sleep.py"

# rule fitbit_parse_calories:
#     input:
#         data = expand("data/raw/{{pid}}/fitbit_calories_{fitbit_data_type}_raw.csv", fitbit_data_type = (["json"] if config["FITBIT_CALORIES"]["TABLE_FORMAT"] == "JSON" else ["summary", "intraday"]))
#     params:
#         timezone = config["FITBIT_DATA_CONFIGURATION"]["TIMEZONE"]["VALUE"],
#         table = config["FITBIT_CALORIES"]["TABLE"],
#         table_format = config["FITBIT_CALORIES"]["TABLE_FORMAT"]
#     output:
#         summary_data = "data/raw/{pid}/fitbit_calories_summary_parsed.csv",
#         intraday_data = "data/raw/{pid}/fitbit_calories_intraday_parsed.csv"
#     script:
#         "../src/data/fitbit_parse_calories.py"

# Runs the shared readable_datetime.R script on a parsed Fitbit file and writes
# fitbit_{sensor}_{fitbit_data_type}_parsed_with_datetime.csv.
#
# This rule used to call ../src/data/readable_datetime.R and passed only
# fixed_timezone plus the time-segment settings, while every other
# *readable_datetime rule in this file calls ../src/data/datetime/readable_datetime.R
# with pid_file, tzcodes_file, device_type, timezone_parameters and pid.
# It is aligned with those rules here.
# NOTE(review): confirm that datetime/readable_datetime.R accepts
# device_type = "fitbit" and that the old script path is no longer in use.
rule fitbit_readable_datetime:
    input:
        sensor_input = "data/raw/{pid}/fitbit_{sensor}_{fitbit_data_type}_parsed.csv",
        time_segments = "data/interim/time_segments/{pid}_time_segments.csv",
        pid_file = "data/external/participant_files/{pid}.yaml",
        tzcodes_file = input_tzcodes_file,
    params:
        device_type = "fitbit",
        timezone_parameters = config["TIMEZONE"],
        pid = "{pid}",
        # Kept so a script that still reads the previous parameter set finds it.
        fixed_timezone = config["FITBIT_DATA_CONFIGURATION"]["TIMEZONE"]["VALUE"],
        time_segments_type = config["TIME_SEGMENTS"]["TYPE"],
        include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
    output:
        "data/raw/{pid}/fitbit_{sensor}_{fitbit_data_type}_parsed_with_datetime.csv"
    script:
        "../src/data/datetime/readable_datetime.R"

# Produces data/raw/{pid}/empatica_{sensor}_raw.csv by running pull_empatica_data.R.
# Inputs are supplied by pull_empatica_data_input_with_mutation_scripts, which
# is defined outside this file section and unpacked into named inputs.
rule pull_empatica_data:
    input:
        unpack(pull_empatica_data_input_with_mutation_scripts)
    params:
        # Stream configuration selected by EMPATICA_DATA_STREAMS.USE.
        data_configuration = config["EMPATICA_DATA_STREAMS"][config["EMPATICA_DATA_STREAMS"]["USE"]],
        sensor = "empatica_{sensor}",
        pid = "{pid}"
    output:
        "data/raw/{pid}/empatica_{sensor}_raw.csv"
    script:
        "../src/data/streams/pull_empatica_data.R"

# Runs the shared readable_datetime.R script on a raw Empatica sensor file and
# writes data/raw/{pid}/empatica_{sensor}_with_datetime.csv.
# tzcodes_file is resolved by input_tzcodes_file, defined outside this section.
rule empatica_readable_datetime:
    input:
        sensor_input = "data/raw/{pid}/empatica_{sensor}_raw.csv",
        time_segments = "data/interim/time_segments/{pid}_time_segments.csv",
        pid_file = "data/external/participant_files/{pid}.yaml",
        tzcodes_file = input_tzcodes_file,
    params:
        device_type = "empatica",
        timezone_parameters = config["TIMEZONE"],
        pid = "{pid}",
        time_segments_type = config["TIME_SEGMENTS"]["TYPE"],
        include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
    output:
        "data/raw/{pid}/empatica_{sensor}_with_datetime.csv"
    script:
        "../src/data/datetime/readable_datetime.R"