diff --git a/.gitignore b/.gitignore index eb1b7009..e7ae40b8 100644 --- a/.gitignore +++ b/.gitignore @@ -114,10 +114,14 @@ sn_profile_*/ settings.dcf tests/fakedata_generation/ site/ -!credentials.yaml +credentials.yaml # Docker container and other files .devcontainer # Calculating features module calculatingfeatures/ + + +# Temp folder for rapids data/external +rapids_temp_data/ diff --git a/Snakefile b/Snakefile index 4241aa19..4455e333 100644 --- a/Snakefile +++ b/Snakefile @@ -5,6 +5,7 @@ include: "rules/common.smk" include: "rules/renv.smk" include: "rules/preprocessing.smk" include: "rules/features.smk" +include: "rules/models.smk" include: "rules/reports.smk" import itertools diff --git a/config.yaml b/config.yaml index e3498510..3641c236 100644 --- a/config.yaml +++ b/config.yaml @@ -3,16 +3,17 @@ ######################################################################################################################## # See https://www.rapids.science/latest/setup/configuration/#participant-files -PIDS: [p031] #p01, p02, p03] +PIDS: ['p031', 'p032', 'p033', 'p034', 'p035', 'p036', 'p037', 'p038', 'p039', 'p040', 'p042', 'p043', 'p044', 'p045', 'p046', 'p049', 'p050', 'p052', 'p053', 'p054', 'p055', 'p057', 'p058', 'p059', 'p060', 'p061', 'p062', 'p064', 'p067', 'p068', 'p069', 'p070', 'p071', 'p072', 'p073', 'p074', 'p075', 'p076', 'p077', 'p078', 'p079', 'p080', 'p081', 'p082', 'p083', 'p084', 'p085', 'p086', 'p088', 'p089', 'p090', 'p091', 'p092', 'p093', 'p106', 'p107'] # See https://www.rapids.science/latest/setup/configuration/#automatic-creation-of-participant-files CREATE_PARTICIPANT_FILES: - CSV_FILE_PATH: "data/external/example_participants.csv" # see docs for required format + USERNAMES_CSV: "data/external/main_study_usernames.csv" + CSV_FILE_PATH: "data/external/main_study_participants.csv" # see docs for required format PHONE_SECTION: ADD: True IGNORED_DEVICE_IDS: [] FITBIT_SECTION: - ADD: True + ADD: False IGNORED_DEVICE_IDS: [] EMPATICA_SECTION: ADD: True @@ -21,16 +22,17 @@ CREATE_PARTICIPANT_FILES: # See https://www.rapids.science/latest/setup/configuration/#time-segments TIME_SEGMENTS: &time_segments TYPE: PERIODIC # FREQUENCY, PERIODIC, EVENT - FILE: "data/external/timesegments_periodic.csv" - INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC, see docs + FILE: "data/external/timesegments_daily.csv" + INCLUDE_PAST_PERIODIC_SEGMENTS: TRUE # Only relevant if TYPE=PERIODIC, see docs # See https://www.rapids.science/latest/setup/configuration/#timezone-of-your-study TIMEZONE: - TYPE: SINGLE + TYPE: MULTIPLE SINGLE: TZCODE: Europe/Ljubljana MULTIPLE: - TZCODES_FILE: data/external/multiple_timezones_example.csv + TZ_FILE: data/external/timezone.csv + TZCODES_FILE: data/external/multiple_timezones.csv IF_MISSING_TZCODE: USE_DEFAULT DEFAULT_TZCODE: Europe/Ljubljana FITBIT: @@ -85,7 +87,7 @@ PHONE_ACTIVITY_RECOGNITION: EPISODE_THRESHOLD_BETWEEN_ROWS: 5 # minutes. Max time difference for two consecutive rows to be considered within the same AR episode. PROVIDERS: RAPIDS: - COMPUTE: False + COMPUTE: True FEATURES: ["count", "mostcommonactivity", "countuniqueactivities", "durationstationary", "durationmobile", "durationvehicle"] ACTIVITY_CLASSES: STATIONARY: ["still", "tilting"] @@ -114,7 +116,7 @@ PHONE_APPLICATIONS_FOREGROUND: SCRAPE_MISSING_CATEGORIES: False # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway PROVIDERS: RAPIDS: - COMPUTE: False + COMPUTE: True INCLUDE_EPISODE_FEATURES: True SINGLE_CATEGORIES: ["all", "email"] MULTIPLE_CATEGORIES: @@ -149,7 +151,7 @@ PHONE_BATTERY: EPISODE_THRESHOLD_BETWEEN_ROWS: 30 # minutes. Max time difference for two consecutive rows to be considered within the same battery episode. PROVIDERS: RAPIDS: - COMPUTE: False + COMPUTE: True FEATURES: ["countdischarge", "sumdurationdischarge", "countcharge", "sumdurationcharge", "avgconsumptionrate", "maxconsumptionrate"] SRC_SCRIPT: src/features/phone_battery/rapids/main.py @@ -158,12 +160,12 @@ PHONE_BLUETOOTH: CONTAINER: bluetooth PROVIDERS: RAPIDS: - COMPUTE: False + COMPUTE: True FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"] SRC_SCRIPT: src/features/phone_bluetooth/rapids/main.R DORYAB: - COMPUTE: False + COMPUTE: True FEATURES: ALL: DEVICES: ["countscans", "uniquedevices", "meanscans", "stdscans"] @@ -184,7 +186,7 @@ PHONE_CALLS: CONTAINER: call PROVIDERS: RAPIDS: - COMPUTE: False + COMPUTE: True FEATURES_TYPE: EPISODES # EVENTS or EPISODES CALL_TYPES: [missed, incoming, outgoing] FEATURES: @@ -227,7 +229,7 @@ PHONE_DATA_YIELD: PHONE_WIFI_VISIBLE] PROVIDERS: RAPIDS: - COMPUTE: False + COMPUTE: True FEATURES: [ratiovalidyieldedminutes, ratiovalidyieldedhours] MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1, minimum percentage of valid minutes in an hour to be considered valid. SRC_SCRIPT: src/features/phone_data_yield/rapids/main.R @@ -255,7 +257,7 @@ PHONE_LIGHT: CONTAINER: light_sensor PROVIDERS: RAPIDS: - COMPUTE: False + COMPUTE: True FEATURES: ["count", "maxlux", "minlux", "avglux", "medianlux", "stdlux"] SRC_SCRIPT: src/features/phone_light/rapids/main.py @@ -269,7 +271,7 @@ PHONE_LOCATIONS: PROVIDERS: DORYAB: - COMPUTE: False + COMPUTE: True FEATURES: ["locationvariance","loglocationvariance","totaldistance","avgspeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","avglengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome", "homelabel"] DBSCAN_EPS: 100 # meters DBSCAN_MINSAMPLES: 5 @@ -284,7 +286,7 @@ PHONE_LOCATIONS: SRC_SCRIPT: src/features/phone_locations/doryab/main.py BARNETT: - COMPUTE: False + COMPUTE: True FEATURES: ["hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","circdnrtn","wkenddayrtn"] IF_MULTIPLE_TIMEZONES: USE_MOST_COMMON MINUTES_DATA_USED: False # Use this for quality control purposes, how many minutes of data (location coordinates gruped by minute) were used to compute features @@ -302,7 +304,7 @@ PHONE_MESSAGES: CONTAINER: sms PROVIDERS: RAPIDS: - COMPUTE: False + COMPUTE: True MESSAGES_TYPES : [received, sent] FEATURES: received: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact] @@ -314,7 +316,7 @@ PHONE_SCREEN: CONTAINER: screen PROVIDERS: RAPIDS: - COMPUTE: False + COMPUTE: True REFERENCE_HOUR_FIRST_USE: 0 IGNORE_EPISODES_SHORTER_THAN: 0 # in minutes, set to 0 to disable IGNORE_EPISODES_LONGER_THAN: 360 # in minutes, set to 0 to disable @@ -336,13 +338,12 @@ PHONE_WIFI_VISIBLE: CONTAINER: wifi PROVIDERS: RAPIDS: - COMPUTE: False + COMPUTE: True FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"] SRC_SCRIPT: src/features/phone_wifi_visible/rapids/main.R - ######################################################################################################################## # FITBIT # ######################################################################################################################## @@ -484,6 +485,7 @@ FITBIT_STEPS_INTRADAY: INCLUDE_ZERO_STEP_ROWS: False SRC_SCRIPT: src/features/fitbit_steps_intraday/rapids/main.py + ######################################################################################################################## # EMPATICA # ######################################################################################################################## @@ -506,7 +508,7 @@ EMPATICA_ACCELEROMETER: FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"] SRC_SCRIPT: src/features/empatica_accelerometer/dbdp/main.py CR: - COMPUTE: False + COMPUTE: True FEATURES: ["totalMagnitudeBand", "absoluteMeanBand", "varianceBand"] # Acc features WINDOWS: COMPUTE: True @@ -534,7 +536,7 @@ EMPATICA_TEMPERATURE: FEATURES: ["maxtemp", "mintemp", "avgtemp", "mediantemp", "modetemp", "stdtemp", "diffmaxmodetemp", "diffminmodetemp", "entropytemp"] SRC_SCRIPT: src/features/empatica_temperature/dbdp/main.py CR: - COMPUTE: False + COMPUTE: True FEATURES: ["maximum", "minimum", "meanAbsChange", "longestStrikeAboveMean", "longestStrikeBelowMean", "stdDev", "median", "meanChange", "sumSquared", "squareSumOfComponent", "sumOfSquareComponents"] WINDOWS: @@ -595,7 +597,7 @@ EMPATICA_INTER_BEAT_INTERVAL: FEATURES: ["maxibi", "minibi", "avgibi", "medianibi", "modeibi", "stdibi", "diffmaxmodeibi", "diffminmodeibi", "entropyibi"] SRC_SCRIPT: src/features/empatica_inter_beat_interval/dbdp/main.py CR: - COMPUTE: False + COMPUTE: True FEATURES: ['meanHr', 'ibi', 'sdnn', 'sdsd', 'rmssd', 'pnn20', 'pnn50', 'sd', 'sd2', 'sd1/sd2', 'numRR', # Time features 'VLF', 'LF', 'LFnorm', 'HF', 'HFnorm', 'LF/HF', 'fullIntegral'] # Freq features PATCH_WITH_BVP: True @@ -612,7 +614,6 @@ EMPATICA_TAGS: PROVIDERS: # None implemented yet - ######################################################################################################################## # PLOTS # ######################################################################################################################## @@ -654,17 +655,17 @@ HEATMAP_FEATURE_CORRELATION_MATRIX: ALL_CLEANING_INDIVIDUAL: PROVIDERS: RAPIDS: - COMPUTE: False + COMPUTE: True IMPUTE_SELECTED_EVENT_FEATURES: COMPUTE: True MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33 COLS_NAN_THRESHOLD: 0.3 # set to 1 to disable COLS_VAR_THRESHOLD: True - ROWS_NAN_THRESHOLD: 0.3 # set to 1 to disable + ROWS_NAN_THRESHOLD: 1 # set to 1 to disable DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES - DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable + DATA_YIELD_RATIO_THRESHOLD: 0.3 # set to 0 to disable DROP_HIGHLY_CORRELATED_FEATURES: - COMPUTE: True + COMPUTE: False MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5 CORR_THRESHOLD: 0.95 SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R @@ -672,17 +673,17 @@ ALL_CLEANING_INDIVIDUAL: ALL_CLEANING_OVERALL: PROVIDERS: RAPIDS: - COMPUTE: False + COMPUTE: True IMPUTE_SELECTED_EVENT_FEATURES: COMPUTE: True MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33 COLS_NAN_THRESHOLD: 0.3 # set to 1 to disable COLS_VAR_THRESHOLD: True - ROWS_NAN_THRESHOLD: 0.3 # set to 1 to disable + ROWS_NAN_THRESHOLD: 1 # set to 1 to disable DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES - DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable + DATA_YIELD_RATIO_THRESHOLD: 0.3 # set to 0 to disable DROP_HIGHLY_CORRELATED_FEATURES: - COMPUTE: True + COMPUTE: False MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5 CORR_THRESHOLD: 0.95 SRC_SCRIPT: src/features/all_cleaning_overall/rapids/main.R @@ -691,12 +692,14 @@ ALL_CLEANING_OVERALL: ######################################################################################################################## # Z-score standardization # ######################################################################################################################## + STANDARDIZATION: PROVIDERS: CR: COMPUTE: True SRC_SCRIPT: src/features/standardization/main.py + ######################################################################################################################## # Baseline # ######################################################################################################################## @@ -716,4 +719,3 @@ PARAMS_FOR_ANALYSIS: TARGET: COMPUTE: True LABEL: PANAS_negative_affect_mean - diff --git a/rules/preprocessing.smk b/rules/preprocessing.smk index 07ffc390..fb583459 100644 --- a/rules/preprocessing.smk +++ b/rules/preprocessing.smk @@ -4,6 +4,36 @@ rule create_example_participant_files: shell: "echo 'PHONE:\n DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524]\n PLATFORMS: [android]\n LABEL: test-01\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\nFITBIT:\n DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524]\n LABEL: test-01\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\n' >> ./data/external/participant_files/example01.yaml && echo 'PHONE:\n DEVICE_IDS: [13dbc8a3-dae3-4834-823a-4bc96a7d459d]\n PLATFORMS: [ios]\n LABEL: test-02\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\nFITBIT:\n DEVICE_IDS: [13dbc8a3-dae3-4834-823a-4bc96a7d459d]\n LABEL: test-02\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\n' >> ./data/external/participant_files/example02.yaml" +# rule query_usernames_device_empatica_ids: +# params: +# baseline_folder = "/mnt/e/STRAWbaseline/" +# output: +# usernames_file = config["CREATE_PARTICIPANT_FILES"]["USERNAMES_CSV"], +# timezone_file = config["TIMEZONE"]["MULTIPLE"]["TZ_FILE"] +# script: +# "../../participants/prepare_usernames_file.py" + +rule prepare_tzcodes_file: + input: + timezone_file = config["TIMEZONE"]["MULTIPLE"]["TZ_FILE"] + output: + tzcodes_file = config["TIMEZONE"]["MULTIPLE"]["TZCODES_FILE"] + script: + "../tools/create_multi_timezones_file.py" + +rule prepare_participants_csv: + input: + username_list = config["CREATE_PARTICIPANT_FILES"]["USERNAMES_CSV"] + params: + data_configuration = config["PHONE_DATA_STREAMS"][config["PHONE_DATA_STREAMS"]["USE"]], + participants_table = "participants", + device_id_table = "esm", + start_end_date_table = "esm" + output: + participants_file = config["CREATE_PARTICIPANT_FILES"]["CSV_FILE_PATH"] + script: + "../src/data/translate_usernames_into_participants_data.R" + rule create_participants_files: input: participants_file = config["CREATE_PARTICIPANT_FILES"]["CSV_FILE_PATH"] @@ -218,4 +248,4 @@ rule empatica_readable_datetime: output: "data/raw/{pid}/empatica_{sensor}_with_datetime.csv" script: - "../src/data/datetime/readable_datetime.R" \ No newline at end of file + "../src/data/datetime/readable_datetime.R" diff --git a/src/data/create_participants_files.R b/src/data/create_participants_files.R index 5f67946a..a136abeb 100644 --- a/src/data/create_participants_files.R +++ b/src/data/create_participants_files.R @@ -58,7 +58,7 @@ participants %>% lines <- append(lines, empty_fitbit) if(add_empatica_section == TRUE && !is.na(row[empatica_device_id_column])){ - lines <- append(lines, c("EMPATICA:", paste0(" DEVICE_IDS: [",row[empatica_device_id_column],"]"), + lines <- append(lines, c("EMPATICA:", paste0(" DEVICE_IDS: [",row$label,"]"), paste(" LABEL:",row$label), paste(" START_DATE:", start_date), paste(" END_DATE:", end_date))) } else lines <- append(lines, empty_empatica)