Refactor snakefiles to smk files and create common.smk

pull/95/head
JulioV 2020-08-11 18:45:50 -04:00
parent 1dec6764f8
commit b83f8ead44
9 changed files with 121 additions and 188 deletions


@@ -1,9 +1,10 @@
 configfile: "config.yaml"
 
-include: "rules/renv.snakefile"
-include: "rules/preprocessing.snakefile"
-include: "rules/features.snakefile"
-include: "rules/models.snakefile"
-include: "rules/reports.snakefile"
+include: "rules/common.smk"
+include: "rules/renv.smk"
+include: "rules/preprocessing.smk"
+include: "rules/features.smk"
+include: "rules/models.smk"
+include: "rules/reports.smk"
 
 import itertools
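
Two things worth noting before the new file: rules/common.smk has to be included ahead of the rule files, because those files refer to its helper functions by name when their rules are parsed, and every platform-dependent helper reads the second line of a participant file under data/external/. A minimal sketch of such a file (the device ID is hypothetical; only the platform-on-line-2 layout is what infer_participant_platform below relies on — line 1 holds the device id(s), line 2 the platform: 'android', 'ios', 'multiple', or a comma-separated list):

    a748ee1a-1d0b-4ae9-9074-279a2b6ba524
    android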

rules/common.smk (new file, mode 100644, 113 additions)

@@ -0,0 +1,113 @@
+# Common.smk ##########################################################################################################
+def infer_participant_platform(participant_file):
+    with open(participant_file, encoding="ISO-8859-1") as external_file:
+        external_file_content = external_file.readlines()
+    platforms = external_file_content[1].strip().split(",")
+    if platforms[0] == "multiple" or (len(platforms) > 1 and "android" in platforms and "ios" in platforms):
+        platform = "android"
+    else:
+        platform = platforms[0]
+
+    if platform not in ["android", "ios"]:
+        raise ValueError("Platform (line 2) in a participant file should be 'android', 'ios', or 'multiple'. You typed '" + str(platforms) + "'")
+
+    return platform
+
+# Preprocessing.smk ####################################################################################################
+def optional_phone_sensed_bins_input(wildcards):
+    platform = infer_participant_platform("data/external/" + wildcards.pid)
+
+    if platform == "android":
+        tables_platform = [table for table in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["IOS"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]]] # for android, discard any ios tables that may exist
+    elif platform == "ios":
+        tables_platform = [table for table in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"]]] # for ios, discard any android tables that may exist
+
+    return expand("data/raw/{{pid}}/{table}_with_datetime.csv", table = tables_platform)
+
+# Features.smk #########################################################################################################
+def optional_ar_input(wildcards):
+    platform = infer_participant_platform("data/external/" + wildcards.pid)
+    if platform == "android":
+        return ["data/raw/{pid}/" + config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"] + "_with_datetime_unified.csv",
+                "data/processed/{pid}/" + config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"] + "_deltas.csv"]
+    elif platform == "ios":
+        return ["data/raw/{pid}/" + config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"] + "_with_datetime_unified.csv",
+                "data/processed/{pid}/" + config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"] + "_deltas.csv"]
+
+def optional_conversation_input(wildcards):
+    platform = infer_participant_platform("data/external/" + wildcards.pid)
+    if platform == "android":
+        return ["data/raw/{pid}/" + config["CONVERSATION"]["DB_TABLE"]["ANDROID"] + "_with_datetime_unified.csv"]
+    elif platform == "ios":
+        return ["data/raw/{pid}/" + config["CONVERSATION"]["DB_TABLE"]["IOS"] + "_with_datetime_unified.csv"]
+
+def optional_location_barnett_input(wildcards):
+    if config["BARNETT_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED":
+        return expand("data/raw/{{pid}}/{sensor}_resampled.csv", sensor=config["BARNETT_LOCATION"]["DB_TABLE"])
+    else:
+        return expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["BARNETT_LOCATION"]["DB_TABLE"])
+
+def optional_location_doryab_input(wildcards):
+    if config["DORYAB_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED":
+        return expand("data/raw/{{pid}}/{sensor}_resampled.csv", sensor=config["DORYAB_LOCATION"]["DB_TABLE"])
+    else:
+        return expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["DORYAB_LOCATION"]["DB_TABLE"])
+
+def optional_steps_sleep_input(wildcards):
+    if config["STEP"]["EXCLUDE_SLEEP"]["EXCLUDE"] == True and config["STEP"]["EXCLUDE_SLEEP"]["TYPE"] == "FITBIT_BASED":
+        return "data/raw/{pid}/fitbit_sleep_summary_with_datetime.csv"
+    else:
+        return []
+
+def optional_wifi_input(wildcards):
+    if len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) > 0 and len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) == 0:
+        return {"visible_access_points": expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"])}
+    elif len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) == 0 and len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) > 0:
+        return {"connected_access_points": expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"])}
+    elif len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) > 0 and len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) > 0:
+        return {"visible_access_points": expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]), "connected_access_points": expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"])}
+    else:
+        raise ValueError("If you are computing WIFI features you need to provide either VISIBLE_ACCESS_POINTS, CONNECTED_ACCESS_POINTS or both")
+
+# Models.smk ###########################################################################################################
+def input_merge_features_of_single_participant(wildcards):
+    if wildcards.source == "phone_fitbit_features":
+        return expand("data/processed/{pid}/{features}_{day_segment}.csv", pid=wildcards.pid, features=config["PARAMS_FOR_ANALYSIS"]["PHONE_FEATURES"] + config["PARAMS_FOR_ANALYSIS"]["FITBIT_FEATURES"], day_segment=wildcards.day_segment)
+    else:
+        return expand("data/processed/{pid}/{features}_{day_segment}.csv", pid=wildcards.pid, features=config["PARAMS_FOR_ANALYSIS"][wildcards.source.upper()], day_segment=wildcards.day_segment)
+
+def optional_input_days_to_include(wildcards):
+    if config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["ENABLED"]:
+        # This input automatically triggers the rule days_to_analyse in mystudy.snakefile
+        return ["data/interim/{pid}/days_to_analyse" + \
+                    "_" + str(config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_BEFORE_SURGERY"]) + \
+                    "_" + str(config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_IN_HOSPITAL"]) + \
+                    "_" + str(config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_AFTER_DISCHARGE"]) + ".csv"]
+    else:
+        return []
+
+def optional_input_valid_sensed_days(wildcards):
+    if config["PARAMS_FOR_ANALYSIS"]["DROP_VALID_SENSED_DAYS"]["ENABLED"]:
+        # This input automatically triggers the rule phone_valid_sensed_days in preprocessing.smk
+        return ["data/interim/{pid}/phone_valid_sensed_days_{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins.csv"]
+    else:
+        return []
+
+# Reports.smk ##########################################################################################################
+def optional_heatmap_days_by_sensors_input(wildcards):
+    platform = infer_participant_platform("data/external/" + wildcards.pid)
+
+    if platform == "android":
+        tables_platform = [table for table in config["HEATMAP_DAYS_BY_SENSORS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["IOS"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]]] # for android, discard any ios tables that may exist
+    elif platform == "ios":
+        tables_platform = [table for table in config["HEATMAP_DAYS_BY_SENSORS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"]]] # for ios, discard any android tables that may exist
+
+    return expand("data/raw/{{pid}}/{table}_with_datetime.csv", table = tables_platform)
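
All of these helpers are meant to be used as Snakemake input functions: a rule that names one of them gets it called with that rule's wildcards while the DAG is built, and expand() only fills the single-braced placeholders, leaving the double-braced {{pid}} behind as a literal {pid} wildcard for the rule to match. A minimal sketch of the consuming side (the rule names, output paths, and script paths here are illustrative, not part of this commit):

    rule activity_recognition_features:
        input:
            optional_ar_input                 # called as optional_ar_input(wildcards)
        output:
            "data/processed/{pid}/activity_recognition_{day_segment}.csv"
        script:
            "../src/features/activity_recognition_features.py"

    rule wifi_features:
        input:
            # optional_wifi_input returns a dict, so Snakemake's unpack() is needed
            # to turn its keys into named inputs (input.visible_access_points, ...)
            unpack(optional_wifi_input)
        output:
            "data/processed/{pid}/wifi_{day_segment}.csv"
        script:
            "../src/features/wifi_features.py"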


@@ -1,63 +1,3 @@
-def infer_participant_platform(participant_file):
-    with open(participant_file, encoding="ISO-8859-1") as external_file:
-        external_file_content = external_file.readlines()
-    platforms = external_file_content[1].strip().split(",")
-    if platforms[0] == "multiple" or (len(platforms) > 1 and "android" in platforms and "ios" in platforms):
-        platform = "android"
-    else:
-        platform = platforms[0]
-    return platform
-
-def optional_ar_input(wildcards):
-    platform = infer_participant_platform("data/external/" + wildcards.pid)
-    if platform == "android":
-        return ["data/raw/{pid}/" + config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"] + "_with_datetime_unified.csv",
-                "data/processed/{pid}/" + config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"] + "_deltas.csv"]
-    elif platform == "ios":
-        return ["data/raw/{pid}/" + config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"] + "_with_datetime_unified.csv",
-                "data/processed/{pid}/" + config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"] + "_deltas.csv"]
-    else:
-        raise ValueError("Platform (line 2) in a participant file should be 'android', 'ios', or 'multiple'. You typed '" + platforms + "'")
-
-def optional_conversation_input(wildcards):
-    platform = infer_participant_platform("data/external/" + wildcards.pid)
-    if platform == "android":
-        return ["data/raw/{pid}/" + config["CONVERSATION"]["DB_TABLE"]["ANDROID"] + "_with_datetime_unified.csv"]
-    elif platform == "ios":
-        return ["data/raw/{pid}/" + config["CONVERSATION"]["DB_TABLE"]["IOS"] + "_with_datetime_unified.csv"]
-    else:
-        raise ValueError("Platform (line 2) in a participant file should be 'android' or 'ios', or 'multiple'. You typed '" + platforms + "'")
-
-def optional_location_input(wildcards):
-    if config["BARNETT_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED":
-        return expand("data/raw/{{pid}}/{sensor}_resampled.csv", sensor=config["BARNETT_LOCATION"]["DB_TABLE"])
-    else:
-        return expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["BARNETT_LOCATION"]["DB_TABLE"])
-
-def optional_location_doryab_input(wildcards):
-    if config["DORYAB_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED":
-        return expand("data/raw/{{pid}}/{sensor}_resampled.csv", sensor=config["DORYAB_LOCATION"]["DB_TABLE"])
-    else:
-        return expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["DORYAB_LOCATION"]["DB_TABLE"])
-
-def optional_steps_sleep_input(wildcards):
-    if config["STEP"]["EXCLUDE_SLEEP"]["EXCLUDE"] == True and config["STEP"]["EXCLUDE_SLEEP"]["TYPE"] == "FITBIT_BASED":
-        return "data/raw/{pid}/fitbit_sleep_summary_with_datetime.csv"
-    else:
-        return []
-
-def optional_wifi_input(wildcards):
-    if len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) > 0 and len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) == 0:
-        return {"visible_access_points": expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"])}
-    elif len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) == 0 and len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) > 0:
-        return {"connected_access_points": expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"])}
-    elif len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) > 0 and len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) > 0:
-        return {"visible_access_points": expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]), "connected_access_points": expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"])}
-    else:
-        raise ValueError("If you are computing WIFI features you need to provide either VISIBLE_ACCESS_POINTS, CONNECTED_ACCESS_POINTS or both")
-
 rule messages_features:
     input:
         expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["MESSAGES"]["DB_TABLE"])
@@ -116,7 +56,7 @@ rule ios_activity_recognition_deltas:
 
 rule location_barnett_features:
     input:
-        locations = optional_location_input
+        locations = optional_location_barnett_input
     params:
         features = config["BARNETT_LOCATION"]["FEATURES"],
         locations_to_use = config["BARNETT_LOCATION"]["LOCATIONS_TO_USE"],


@@ -36,29 +36,6 @@ rule demographic_features:
     script:
         "../src/features/demographic_features.py"
-
-def input_merge_features_of_single_participant(wildcards):
-    if wildcards.source == "phone_fitbit_features":
-        return expand("data/processed/{pid}/{features}_{day_segment}.csv", pid=wildcards.pid, features=config["PARAMS_FOR_ANALYSIS"]["PHONE_FEATURES"] + config["PARAMS_FOR_ANALYSIS"]["FITBIT_FEATURES"], day_segment=wildcards.day_segment)
-    else:
-        return expand("data/processed/{pid}/{features}_{day_segment}.csv", pid=wildcards.pid, features=config["PARAMS_FOR_ANALYSIS"][wildcards.source.upper()], day_segment=wildcards.day_segment)
-
-def optional_input_days_to_include(wildcards):
-    if config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["ENABLED"]:
-        # This input automatically trigers the rule days_to_analyse in mystudy.snakefile
-        return ["data/interim/{pid}/days_to_analyse" + \
-                    "_" + str(config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_BEFORE_SURGERY"]) + \
-                    "_" + str(config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_IN_HOSPITAL"]) + \
-                    "_" + str(config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_AFTER_DISCHARGE"]) + ".csv"]
-    else:
-        return []
-
-def optional_input_valid_sensed_days(wildcards):
-    if config["PARAMS_FOR_ANALYSIS"]["DROP_VALID_SENSED_DAYS"]["ENABLED"]:
-        # This input automatically trigers the rule phone_valid_sensed_days in preprocessing.snakefile
-        return ["data/interim/{pid}/phone_valid_sensed_days_{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins.csv"]
-    else:
-        return []
 
 rule merge_features_for_individual_model:
     input:
         feature_files = input_merge_features_of_single_participant,
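
As a worked example of the merge input (participant ID, feature lists, and day segment here are hypothetical): with wildcards pid=p01, source=phone_fitbit_features, day_segment=daily and config lists PHONE_FEATURES=["sms", "call"], FITBIT_FEATURES=["fitbit_steps"], input_merge_features_of_single_participant returns:

    data/processed/p01/sms_daily.csv
    data/processed/p01/call_daily.csv
    data/processed/p01/fitbit_steps_daily.csv

For any other source, the same function looks up config["PARAMS_FOR_ANALYSIS"][wildcards.source.upper()] instead of concatenating the two lists.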


@@ -1,22 +0,0 @@
-# --- Packrat Rules --- #
-## Taken from https://github.com/lachlandeer/snakemake-econ-r
-
-## packrat_install: installs packrat onto machine
-rule packrat_install:
-    shell:
-        "R -e 'install.packages(\"packrat\", repos=\"http://cran.us.r-project.org\")'"
-
-## packrat_init: initialize a packrat environment for this project
-rule packrat_init:
-    shell:
-        "R -e 'packrat::init()'"
-
-## packrat_snap : Look for new R packages in files & archives them
-rule packrat_snap:
-    shell:
-        "R -e 'packrat::snapshot()'"
-
-## packrat_restore: Installs archived packages onto a new machine
-rule packrat_restore:
-    shell:
-        "R -e 'packrat::restore()'"


@@ -1,22 +1,3 @@
-def optional_phone_sensed_bins_input(wildcards):
-    with open("data/external/" + wildcards.pid, encoding="ISO-8859-1") as external_file:
-        external_file_content = external_file.readlines()
-    platforms = external_file_content[1].strip().split(",")
-    if platforms[0] == "multiple" or (len(platforms) > 1 and "android" in platforms and "ios" in platforms):
-        platform = "android"
-    else:
-        platform = platforms[0]
-
-    if platform not in ["android", "ios"]:
-        raise ValueError("Platform (line 2) in a participant file should be 'android', 'ios', or 'multiple'. You typed '" + platforms + "'")
-
-    if platform == "android":
-        tables_platform = [table for table in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["IOS"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]]] # for android, discard any ios tables that may exist
-    elif platform == "ios":
-        tables_platform = [table for table in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"]]] # for ios, discard any android tables that may exist
-
-    return expand("data/raw/{{pid}}/{table}_with_datetime.csv", table = tables_platform)
-
 rule restore_sql_file:
     input:
         sql_file = "data/external/rapids_example.sql",


@@ -1,27 +1,3 @@
-def optional_heatmap_days_by_sensors_input(wildcards):
-    with open("data/external/" + wildcards.pid, encoding="ISO-8859-1") as external_file:
-        external_file_content = external_file.readlines()
-    platforms = external_file_content[1].strip().split(",")
-    if platforms[0] == "multiple" or (len(platforms) > 1 and "android" in platforms and "ios" in platforms):
-        platform = "android"
-    else:
-        platform = platforms[0]
-
-    if platform not in ["android", "ios"]:
-        raise ValueError("Platform (line 2) in a participant file should be 'android', 'ios', or 'multiple'. You typed '" + platforms + "'")
-
-    input_for_heatmap_days_by_sensors, tables = [], config["HEATMAP_DAYS_BY_SENSORS"]["DB_TABLES"]
-    for sensor in ["ACTIVITY_RECOGNITION", "CONVERSATION"]:
-        table = config[sensor]["DB_TABLE"][platform.upper()]
-        if table in tables:
-            input_for_heatmap_days_by_sensors.append("data/raw/{pid}/" + table + "_with_datetime.csv")
-        tables = [x for x in tables if x not in config[sensor]["DB_TABLE"].values()]
-    for table in tables:
-        input_for_heatmap_days_by_sensors.append("data/raw/{pid}/" + table + "_with_datetime.csv")
-    return input_for_heatmap_days_by_sensors
-
 rule heatmap_features_correlations:
     input:
         features = expand("data/processed/{pid}/{sensor}_{day_segment}.csv", pid=config["PIDS"], sensor=config["HEATMAP_FEATURES_CORRELATIONS"]["PHONE_FEATURES"]+config["HEATMAP_FEATURES_CORRELATIONS"]["FITBIT_FEATURES"], day_segment=config["DAY_SEGMENTS"]),
@@ -98,36 +74,3 @@ rule overall_compliance_heatmap:
         "reports/data_exploration/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/overall_compliance_heatmap.html"
     script:
         "../src/visualization/overall_compliance_heatmap.py"
-
-# rule heatmap_rows:
-#     input:
-#         sensor = "data/raw/{pid}/{sensor}_with_datetime.csv",
-#         pid_file = "data/external/{pid}"
-#     params:
-#         table = "{sensor}",
-#         pid = "{pid}",
-#         bin_size = config["PHONE_VALID_SENSED_BINS"]["BIN_SIZE"]
-#     output:
-#         "reports/figures/{pid}/{sensor}_heatmap_rows.html"
-#     script:
-#         "../src/visualization/heatmap_rows.py"
-
-# rule battery_consumption_rates_barchart:
-#     input:
-#         sensor = "data/processed/{pid}/battery_daily.csv",
-#         pid_file = "data/external/{pid}"
-#     params:
-#         pid = "{pid}"
-#     output:
-#         "reports/figures/{pid}/battery_consumption_rates_barchart.html"
-#     script:
-#         "../src/visualization/battery_consumption_rates_barchart.py"
-
-# rule compliance_report:
-#     input:
-#         sensor_heatmaps = expand("reports/figures/{{pid}}/{sensor}_heatmap_rows.html", sensor=PHONE_SENSORS),
-#         compliance_heatmap = rules.compliance_heatmap.output
-#     output:
-#         "reports/compliance/{pid}/compliance_report.html",
-#     script:
-#         "../src/visualization/compliance_report.Rmd"


@@ -19,7 +19,7 @@ cp tests/data/external/* data/external
 # cp rules/preprocessing.snakefile bak
 echo Disabling downloading of dataset...
-sed -e '46,58 s/^/#/' -e 's/rules.download_dataset.output/"data\/raw\/\{pid\}\/\{sensor\}_raw\.csv"/' rules/preprocessing.snakefile > tmp
+sed -e '27,39 s/^/#/' -e 's/rules.download_dataset.output/"data\/raw\/\{pid\}\/\{sensor\}_raw\.csv"/' rules/preprocessing.snakefile > tmp
 cp tmp rules/preprocessing.snakefile
 echo Running RAPIDS Pipeline on testdata...
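
The only change here is the sed address range: moving optional_phone_sensed_bins_input out of the preprocessing rules file removed 19 lines (the -1,22 +1,3 hunk above), so the download_dataset rule that this script comments out slid from lines 46-58 up to 27-39. The first -e prepends # to every line in that range; the second rewires rules that consumed rules.download_dataset.output to read the raw CSVs directly. A standalone illustration of the address form (the input is hypothetical):

    printf 'a\nb\nc\nd\n' | sed -e '2,3 s/^/#/'
    # prints: a, #b, #c, d (one per line)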