Refactor snakefiles to smk files and create common.smk

pull/95/head
JulioV 2020-08-11 18:45:50 -04:00
parent 1dec6764f8
commit b83f8ead44
9 changed files with 121 additions and 188 deletions


@@ -1,9 +1,10 @@
 configfile: "config.yaml"
 
-include: "rules/renv.snakefile"
-include: "rules/preprocessing.snakefile"
-include: "rules/features.snakefile"
-include: "rules/models.snakefile"
-include: "rules/reports.snakefile"
+include: "rules/common.smk"
+include: "rules/renv.smk"
+include: "rules/preprocessing.smk"
+include: "rules/features.smk"
+include: "rules/models.smk"
+include: "rules/reports.smk"
 
 import itertools
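
Two things worth noting before the new file: rules/common.smk has to be included ahead of the rule files, because those files refer to its helper functions by name when their rules are parsed, and every platform-dependent helper reads the second line of a participant file under data/external/. A minimal sketch of such a file (the device ID is hypothetical; only the platform-on-line-2 layout is what infer_participant_platform below relies on — line 1 holds the device id(s), line 2 the platform: 'android', 'ios', 'multiple', or a comma-separated list):

    a748ee1a-1d0b-4ae9-9074-279a2b6ba524
    android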

rules/common.smk (new file, mode 100644, 113 additions)

@@ -0,0 +1,113 @@
+# Common.smk ##########################################################################################################
+def infer_participant_platform(participant_file):
+    with open(participant_file, encoding="ISO-8859-1") as external_file:
+        external_file_content = external_file.readlines()
+    platforms = external_file_content[1].strip().split(",")
+    if platforms[0] == "multiple" or (len(platforms) > 1 and "android" in platforms and "ios" in platforms):
+        platform = "android"
+    else:
+        platform = platforms[0]
+
+    if platform not in ["android", "ios"]:
+        raise ValueError("Platform (line 2) in a participant file should be 'android', 'ios', or 'multiple'. You typed '" + str(platforms) + "'")
+
+    return platform
+
+# Preprocessing.smk ####################################################################################################
+def optional_phone_sensed_bins_input(wildcards):
+    platform = infer_participant_platform("data/external/" + wildcards.pid)
+
+    if platform == "android":
+        tables_platform = [table for table in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["IOS"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]]] # for android, discard any ios tables that may exist
+    elif platform == "ios":
+        tables_platform = [table for table in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"]]] # for ios, discard any android tables that may exist
+
+    return expand("data/raw/{{pid}}/{table}_with_datetime.csv", table = tables_platform)
+
+# Features.smk #########################################################################################################
+def optional_ar_input(wildcards):
+    platform = infer_participant_platform("data/external/" + wildcards.pid)
+    if platform == "android":
+        return ["data/raw/{pid}/" + config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"] + "_with_datetime_unified.csv",
+                "data/processed/{pid}/" + config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"] + "_deltas.csv"]
+    elif platform == "ios":
+        return ["data/raw/{pid}/" + config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"] + "_with_datetime_unified.csv",
+                "data/processed/{pid}/" + config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"] + "_deltas.csv"]
+
+def optional_conversation_input(wildcards):
+    platform = infer_participant_platform("data/external/" + wildcards.pid)
+    if platform == "android":
+        return ["data/raw/{pid}/" + config["CONVERSATION"]["DB_TABLE"]["ANDROID"] + "_with_datetime_unified.csv"]
+    elif platform == "ios":
+        return ["data/raw/{pid}/" + config["CONVERSATION"]["DB_TABLE"]["IOS"] + "_with_datetime_unified.csv"]
+
+def optional_location_barnett_input(wildcards):
+    if config["BARNETT_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED":
+        return expand("data/raw/{{pid}}/{sensor}_resampled.csv", sensor=config["BARNETT_LOCATION"]["DB_TABLE"])
+    else:
+        return expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["BARNETT_LOCATION"]["DB_TABLE"])
+
+def optional_location_doryab_input(wildcards):
+    if config["DORYAB_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED":
+        return expand("data/raw/{{pid}}/{sensor}_resampled.csv", sensor=config["DORYAB_LOCATION"]["DB_TABLE"])
+    else:
+        return expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["DORYAB_LOCATION"]["DB_TABLE"])
+
+def optional_steps_sleep_input(wildcards):
+    if config["STEP"]["EXCLUDE_SLEEP"]["EXCLUDE"] == True and config["STEP"]["EXCLUDE_SLEEP"]["TYPE"] == "FITBIT_BASED":
+        return "data/raw/{pid}/fitbit_sleep_summary_with_datetime.csv"
+    else:
+        return []
+
+def optional_wifi_input(wildcards):
+    if len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) > 0 and len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) == 0:
+        return {"visible_access_points": expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"])}
+    elif len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) == 0 and len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) > 0:
+        return {"connected_access_points": expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"])}
+    elif len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) > 0 and len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) > 0:
+        return {"visible_access_points": expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]), "connected_access_points": expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"])}
+    else:
+        raise ValueError("If you are computing WIFI features you need to provide either VISIBLE_ACCESS_POINTS, CONNECTED_ACCESS_POINTS or both")
+
+# Models.smk ###########################################################################################################
+def input_merge_features_of_single_participant(wildcards):
+    if wildcards.source == "phone_fitbit_features":
+        return expand("data/processed/{pid}/{features}_{day_segment}.csv", pid=wildcards.pid, features=config["PARAMS_FOR_ANALYSIS"]["PHONE_FEATURES"] + config["PARAMS_FOR_ANALYSIS"]["FITBIT_FEATURES"], day_segment=wildcards.day_segment)
+    else:
+        return expand("data/processed/{pid}/{features}_{day_segment}.csv", pid=wildcards.pid, features=config["PARAMS_FOR_ANALYSIS"][wildcards.source.upper()], day_segment=wildcards.day_segment)
+
+def optional_input_days_to_include(wildcards):
+    if config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["ENABLED"]:
+        # This input automatically triggers the rule days_to_analyse in mystudy.snakefile
+        return ["data/interim/{pid}/days_to_analyse" + \
+                    "_" + str(config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_BEFORE_SURGERY"]) + \
+                    "_" + str(config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_IN_HOSPITAL"]) + \
+                    "_" + str(config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_AFTER_DISCHARGE"]) + ".csv"]
+    else:
+        return []
+
+def optional_input_valid_sensed_days(wildcards):
+    if config["PARAMS_FOR_ANALYSIS"]["DROP_VALID_SENSED_DAYS"]["ENABLED"]:
+        # This input automatically triggers the rule phone_valid_sensed_days in preprocessing.smk
+        return ["data/interim/{pid}/phone_valid_sensed_days_{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins.csv"]
+    else:
+        return []
+
+# Reports.smk ##########################################################################################################
+def optional_heatmap_days_by_sensors_input(wildcards):
+    platform = infer_participant_platform("data/external/" + wildcards.pid)
+
+    if platform == "android":
+        tables_platform = [table for table in config["HEATMAP_DAYS_BY_SENSORS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["IOS"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]]] # for android, discard any ios tables that may exist
+    elif platform == "ios":
+        tables_platform = [table for table in config["HEATMAP_DAYS_BY_SENSORS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"]]] # for ios, discard any android tables that may exist
+
+    return expand("data/raw/{{pid}}/{table}_with_datetime.csv", table = tables_platform)
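
All of these helpers are meant to be used as Snakemake input functions: a rule that names one of them gets it called with that rule's wildcards while the DAG is built, and expand() only fills the single-braced placeholders, leaving the double-braced {{pid}} behind as a literal {pid} wildcard for the rule to match. A minimal sketch of the consuming side (the rule names, output paths, and script paths here are illustrative, not part of this commit):

    rule activity_recognition_features:
        input:
            optional_ar_input                 # called as optional_ar_input(wildcards)
        output:
            "data/processed/{pid}/activity_recognition_{day_segment}.csv"
        script:
            "../src/features/activity_recognition_features.py"

    rule wifi_features:
        input:
            # optional_wifi_input returns a dict, so Snakemake's unpack() is needed
            # to turn its keys into named inputs (input.visible_access_points, ...)
            unpack(optional_wifi_input)
        output:
            "data/processed/{pid}/wifi_{day_segment}.csv"
        script:
            "../src/features/wifi_features.py"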


@@ -1,63 +1,3 @@
-def infer_participant_platform(participant_file):
-    with open(participant_file, encoding="ISO-8859-1") as external_file:
-        external_file_content = external_file.readlines()
-    platforms = external_file_content[1].strip().split(",")
-    if platforms[0] == "multiple" or (len(platforms) > 1 and "android" in platforms and "ios" in platforms):
-        platform = "android"
-    else:
-        platform = platforms[0]
-    return platform
-
-def optional_ar_input(wildcards):
-    platform = infer_participant_platform("data/external/" + wildcards.pid)
-    if platform == "android":
-        return ["data/raw/{pid}/" + config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"] + "_with_datetime_unified.csv",
-                "data/processed/{pid}/" + config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"] + "_deltas.csv"]
-    elif platform == "ios":
-        return ["data/raw/{pid}/" + config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"] + "_with_datetime_unified.csv",
-                "data/processed/{pid}/" + config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"] + "_deltas.csv"]
-    else:
-        raise ValueError("Platform (line 2) in a participant file should be 'android', 'ios', or 'multiple'. You typed '" + platforms + "'")
-
-def optional_conversation_input(wildcards):
-    platform = infer_participant_platform("data/external/" + wildcards.pid)
-    if platform == "android":
-        return ["data/raw/{pid}/" + config["CONVERSATION"]["DB_TABLE"]["ANDROID"] + "_with_datetime_unified.csv"]
-    elif platform == "ios":
-        return ["data/raw/{pid}/" + config["CONVERSATION"]["DB_TABLE"]["IOS"] + "_with_datetime_unified.csv"]
-    else:
-        raise ValueError("Platform (line 2) in a participant file should be 'android' or 'ios', or 'multiple'. You typed '" + platforms + "'")
-
-def optional_location_input(wildcards):
-    if config["BARNETT_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED":
-        return expand("data/raw/{{pid}}/{sensor}_resampled.csv", sensor=config["BARNETT_LOCATION"]["DB_TABLE"])
-    else:
-        return expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["BARNETT_LOCATION"]["DB_TABLE"])
-
-def optional_location_doryab_input(wildcards):
-    if config["DORYAB_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED":
-        return expand("data/raw/{{pid}}/{sensor}_resampled.csv", sensor=config["DORYAB_LOCATION"]["DB_TABLE"])
-    else:
-        return expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["DORYAB_LOCATION"]["DB_TABLE"])
-
-def optional_steps_sleep_input(wildcards):
-    if config["STEP"]["EXCLUDE_SLEEP"]["EXCLUDE"] == True and config["STEP"]["EXCLUDE_SLEEP"]["TYPE"] == "FITBIT_BASED":
-        return "data/raw/{pid}/fitbit_sleep_summary_with_datetime.csv"
-    else:
-        return []
-
-def optional_wifi_input(wildcards):
-    if len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) > 0 and len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) == 0:
-        return {"visible_access_points": expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"])}
-    elif len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) == 0 and len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) > 0:
-        return {"connected_access_points": expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"])}
-    elif len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) > 0 and len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) > 0:
-        return {"visible_access_points": expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]), "connected_access_points": expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"])}
-    else:
-        raise ValueError("If you are computing WIFI features you need to provide either VISIBLE_ACCESS_POINTS, CONNECTED_ACCESS_POINTS or both")
-
 rule messages_features:
     input:
         expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["MESSAGES"]["DB_TABLE"])
@@ -116,7 +56,7 @@ rule ios_activity_recognition_deltas:
 
 rule location_barnett_features:
     input:
-        locations = optional_location_input
+        locations = optional_location_barnett_input
     params:
         features = config["BARNETT_LOCATION"]["FEATURES"],
         locations_to_use = config["BARNETT_LOCATION"]["LOCATIONS_TO_USE"],


@@ -36,29 +36,6 @@ rule demographic_features:
     script:
         "../src/features/demographic_features.py"
-
-def input_merge_features_of_single_participant(wildcards):
-    if wildcards.source == "phone_fitbit_features":
-        return expand("data/processed/{pid}/{features}_{day_segment}.csv", pid=wildcards.pid, features=config["PARAMS_FOR_ANALYSIS"]["PHONE_FEATURES"] + config["PARAMS_FOR_ANALYSIS"]["FITBIT_FEATURES"], day_segment=wildcards.day_segment)
-    else:
-        return expand("data/processed/{pid}/{features}_{day_segment}.csv", pid=wildcards.pid, features=config["PARAMS_FOR_ANALYSIS"][wildcards.source.upper()], day_segment=wildcards.day_segment)
-
-def optional_input_days_to_include(wildcards):
-    if config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["ENABLED"]:
-        # This input automatically trigers the rule days_to_analyse in mystudy.snakefile
-        return ["data/interim/{pid}/days_to_analyse" + \
-                    "_" + str(config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_BEFORE_SURGERY"]) + \
-                    "_" + str(config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_IN_HOSPITAL"]) + \
-                    "_" + str(config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_AFTER_DISCHARGE"]) + ".csv"]
-    else:
-        return []
-
-def optional_input_valid_sensed_days(wildcards):
-    if config["PARAMS_FOR_ANALYSIS"]["DROP_VALID_SENSED_DAYS"]["ENABLED"]:
-        # This input automatically trigers the rule phone_valid_sensed_days in preprocessing.snakefile
-        return ["data/interim/{pid}/phone_valid_sensed_days_{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins.csv"]
-    else:
-        return []
 
 rule merge_features_for_individual_model:
     input:
         feature_files = input_merge_features_of_single_participant,
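
As a worked example of the merge input (participant ID, feature lists, and day segment here are hypothetical): with wildcards pid=p01, source=phone_fitbit_features, day_segment=daily and config lists PHONE_FEATURES=["sms", "call"], FITBIT_FEATURES=["fitbit_steps"], input_merge_features_of_single_participant returns:

    data/processed/p01/sms_daily.csv
    data/processed/p01/call_daily.csv
    data/processed/p01/fitbit_steps_daily.csv

For any other source, the same function looks up config["PARAMS_FOR_ANALYSIS"][wildcards.source.upper()] instead of concatenating the two lists.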


@@ -1,22 +0,0 @@
-# --- Packrat Rules --- #
-## Taken from https://github.com/lachlandeer/snakemake-econ-r
-
-## packrat_install: installs packrat onto machine
-rule packrat_install:
-    shell:
-        "R -e 'install.packages(\"packrat\", repos=\"http://cran.us.r-project.org\")'"
-
-## packrat_init: initialize a packrat environment for this project
-rule packrat_init:
-    shell:
-        "R -e 'packrat::init()'"
-
-## packrat_snap : Look for new R packages in files & archives them
-rule packrat_snap:
-    shell:
-        "R -e 'packrat::snapshot()'"
-
-## packrat_restore: Installs archived packages onto a new machine
-rule packrat_restore:
-    shell:
-        "R -e 'packrat::restore()'"


@@ -1,22 +1,3 @@
-def optional_phone_sensed_bins_input(wildcards):
-    with open("data/external/" + wildcards.pid, encoding="ISO-8859-1") as external_file:
-        external_file_content = external_file.readlines()
-    platforms = external_file_content[1].strip().split(",")
-    if platforms[0] == "multiple" or (len(platforms) > 1 and "android" in platforms and "ios" in platforms):
-        platform = "android"
-    else:
-        platform = platforms[0]
-
-    if platform not in ["android", "ios"]:
-        raise ValueError("Platform (line 2) in a participant file should be 'android', 'ios', or 'multiple'. You typed '" + platforms + "'")
-
-    if platform == "android":
-        tables_platform = [table for table in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["IOS"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]]] # for android, discard any ios tables that may exist
-    elif platform == "ios":
-        tables_platform = [table for table in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"]]] # for ios, discard any android tables that may exist
-
-    return expand("data/raw/{{pid}}/{table}_with_datetime.csv", table = tables_platform)
-
 rule restore_sql_file:
     input:
         sql_file = "data/external/rapids_example.sql",


@@ -1,27 +1,3 @@
-def optional_heatmap_days_by_sensors_input(wildcards):
-    with open("data/external/" + wildcards.pid, encoding="ISO-8859-1") as external_file:
-        external_file_content = external_file.readlines()
-    platforms = external_file_content[1].strip().split(",")
-    if platforms[0] == "multiple" or (len(platforms) > 1 and "android" in platforms and "ios" in platforms):
-        platform = "android"
-    else:
-        platform = platforms[0]
-
-    if platform not in ["android", "ios"]:
-        raise ValueError("Platform (line 2) in a participant file should be 'android', 'ios', or 'multiple'. You typed '" + platforms + "'")
-
-    input_for_heatmap_days_by_sensors, tables = [], config["HEATMAP_DAYS_BY_SENSORS"]["DB_TABLES"]
-    for sensor in ["ACTIVITY_RECOGNITION", "CONVERSATION"]:
-        table = config[sensor]["DB_TABLE"][platform.upper()]
-        if table in tables:
-            input_for_heatmap_days_by_sensors.append("data/raw/{pid}/" + table + "_with_datetime.csv")
-        tables = [x for x in tables if x not in config[sensor]["DB_TABLE"].values()]
-    for table in tables:
-        input_for_heatmap_days_by_sensors.append("data/raw/{pid}/" + table + "_with_datetime.csv")
-    return input_for_heatmap_days_by_sensors
-
 rule heatmap_features_correlations:
     input:
         features = expand("data/processed/{pid}/{sensor}_{day_segment}.csv", pid=config["PIDS"], sensor=config["HEATMAP_FEATURES_CORRELATIONS"]["PHONE_FEATURES"]+config["HEATMAP_FEATURES_CORRELATIONS"]["FITBIT_FEATURES"], day_segment=config["DAY_SEGMENTS"]),
@@ -98,36 +74,3 @@ rule overall_compliance_heatmap:
         "reports/data_exploration/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/overall_compliance_heatmap.html"
     script:
         "../src/visualization/overall_compliance_heatmap.py"
-
-# rule heatmap_rows:
-#     input:
-#         sensor = "data/raw/{pid}/{sensor}_with_datetime.csv",
-#         pid_file = "data/external/{pid}"
-#     params:
-#         table = "{sensor}",
-#         pid = "{pid}",
-#         bin_size = config["PHONE_VALID_SENSED_BINS"]["BIN_SIZE"]
-#     output:
-#         "reports/figures/{pid}/{sensor}_heatmap_rows.html"
-#     script:
-#         "../src/visualization/heatmap_rows.py"
-
-# rule battery_consumption_rates_barchart:
-#     input:
-#         sensor = "data/processed/{pid}/battery_daily.csv",
-#         pid_file = "data/external/{pid}"
-#     params:
-#         pid = "{pid}"
-#     output:
-#         "reports/figures/{pid}/battery_consumption_rates_barchart.html"
-#     script:
-#         "../src/visualization/battery_consumption_rates_barchart.py"
-
-# rule compliance_report:
-#     input:
-#         sensor_heatmaps = expand("reports/figures/{{pid}}/{sensor}_heatmap_rows.html", sensor=PHONE_SENSORS),
-#         compliance_heatmap = rules.compliance_heatmap.output
-#     output:
-#         "reports/compliance/{pid}/compliance_report.html",
-#     script:
-#         "../src/visualization/compliance_report.Rmd"


@@ -19,7 +19,7 @@ cp tests/data/external/* data/external
 # cp rules/preprocessing.snakefile bak
 echo Disabling downloading of dataset...
-sed -e '46,58 s/^/#/' -e 's/rules.download_dataset.output/"data\/raw\/\{pid\}\/\{sensor\}_raw\.csv"/' rules/preprocessing.snakefile > tmp
+sed -e '27,39 s/^/#/' -e 's/rules.download_dataset.output/"data\/raw\/\{pid\}\/\{sensor\}_raw\.csv"/' rules/preprocessing.snakefile > tmp
 cp tmp rules/preprocessing.snakefile
 echo Running RAPIDS Pipeline on testdata...
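
The only change here is the sed address range: moving optional_phone_sensed_bins_input out of the preprocessing rules file removed 19 lines (the -1,22 +1,3 hunk above), so the download_dataset rule that this script comments out slid from lines 46-58 up to 27-39. The first -e prepends # to every line in that range; the second rewires rules that consumed rules.download_dataset.output to read the raw CSVs directly. A standalone illustration of the address form (the input is hypothetical):

    printf 'a\nb\nc\nd\n' | sed -e '2,3 s/^/#/'
    # prints: a, #b, #c, d (one per line)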