Reseting files to defaults - for Minimal Working Example

sociality-task
Primoz 2022-03-16 13:30:19 +00:00
parent 50358978cc
commit dc2b462145
4 changed files with 48 additions and 123 deletions

View File

@ -5,7 +5,6 @@ include: "rules/common.smk"
include: "rules/renv.smk"
include: "rules/preprocessing.smk"
include: "rules/features.smk"
include: "rules/models.smk"
include: "rules/reports.smk"
import itertools
@ -403,12 +402,6 @@ for provider in config["ALL_CLEANING_OVERALL"]["PROVIDERS"].keys():
if config["ALL_CLEANING_OVERALL"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/processed/features/all_participants/all_sensor_features_cleaned_" + provider.lower() +".csv"))
# Baseline features
files_to_compute.extend(expand("data/raw/baseline_merged.csv"))
files_to_compute.extend(expand("data/raw/{pid}/participant_baseline_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/baseline_questionnaires.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/baseline_features.csv", pid=config["PIDS"]))
rule all:
input:
files_to_compute

View File

@ -3,38 +3,36 @@
########################################################################################################################
# See https://www.rapids.science/latest/setup/configuration/#participant-files
PIDS: ['p031', 'p032', 'p033', 'p034', 'p035', 'p036', 'p037', 'p038', 'p039', 'p040', 'p042', 'p043', 'p044', 'p045', 'p046', 'p049', 'p050', 'p052', 'p053', 'p054', 'p055', 'p057', 'p058', 'p059', 'p060', 'p061', 'p062', 'p064', 'p067', 'p068', 'p069', 'p070', 'p071', 'p072', 'p073', 'p074', 'p075', 'p076', 'p077', 'p078', 'p079', 'p080', 'p081', 'p082', 'p083', 'p084', 'p085', 'p086', 'p088', 'p089', 'p090', 'p091', 'p092', 'p093', 'p106', 'p107']
PIDS: [p01]
# See https://www.rapids.science/latest/setup/configuration/#automatic-creation-of-participant-files
CREATE_PARTICIPANT_FILES:
USERNAMES_CSV: "data/external/main_study_usernames.csv"
CSV_FILE_PATH: "data/external/main_study_participants.csv" # see docs for required format
CSV_FILE_PATH: "data/external/example_participants.csv" # see docs for required format
PHONE_SECTION:
ADD: True
IGNORED_DEVICE_IDS: []
FITBIT_SECTION:
ADD: False
ADD: True
IGNORED_DEVICE_IDS: []
EMPATICA_SECTION:
ADD: False
ADD: True
IGNORED_DEVICE_IDS: []
# See https://www.rapids.science/latest/setup/configuration/#time-segments
TIME_SEGMENTS: &time_segments
TYPE: PERIODIC # FREQUENCY, PERIODIC, EVENT
FILE: "data/external/timesegments_daily.csv"
INCLUDE_PAST_PERIODIC_SEGMENTS: TRUE # Only relevant if TYPE=PERIODIC, see docs
FILE: "data/external/timesegments_periodic.csv"
INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC, see docs
# See https://www.rapids.science/latest/setup/configuration/#timezone-of-your-study
TIMEZONE:
TYPE: MULTIPLE
TYPE: SINGLE
SINGLE:
TZCODE: Europe/Ljubljana
TZCODE: America/New_York
MULTIPLE:
TZ_FILE: data/external/timezone.csv
TZCODES_FILE: data/external/multiple_timezones.csv
IF_MISSING_TZCODE: USE_DEFAULT
DEFAULT_TZCODE: Europe/Ljubljana
TZCODES_FILE: data/external/multiple_timezones_example.csv
IF_MISSING_TZCODE: STOP
DEFAULT_TZCODE: America/New_York
FITBIT:
ALLOW_MULTIPLE_TZ_PER_DEVICE: False
INFER_FROM_SMARTPHONE_TZ: False
@ -45,15 +43,12 @@ TIMEZONE:
# See https://www.rapids.science/latest/setup/configuration/#data-stream-configuration
PHONE_DATA_STREAMS:
USE: aware_postgresql
USE: aware_csv
# AVAILABLE:
aware_mysql:
DATABASE_GROUP: MY_GROUP
aware_postgresql:
DATABASE_GROUP: PSQL_STRAW
aware_csv:
FOLDER: data/external/aware_csv
@ -82,12 +77,12 @@ PHONE_ACCELEROMETER:
# See https://www.rapids.science/latest/features/phone-activity-recognition/
PHONE_ACTIVITY_RECOGNITION:
CONTAINER:
ANDROID: google_ar
ANDROID: plugin_google_activity_recognition
IOS: plugin_ios_activity_recognition
EPISODE_THRESHOLD_BETWEEN_ROWS: 5 # minutes. Max time difference for two consecutive rows to be considered within the same AR episode.
PROVIDERS:
RAPIDS:
COMPUTE: True
COMPUTE: False
FEATURES: ["count", "mostcommonactivity", "countuniqueactivities", "durationstationary", "durationmobile", "durationvehicle"]
ACTIVITY_CLASSES:
STATIONARY: ["still", "tilting"]
@ -107,17 +102,16 @@ PHONE_APPLICATIONS_CRASHES:
# See https://www.rapids.science/latest/features/phone-applications-foreground/
PHONE_APPLICATIONS_FOREGROUND:
CONTAINER: applications
CONTAINER: applications_foreground
APPLICATION_CATEGORIES:
CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scrapped from the Play Store)
CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv"
PACKAGE_NAMES_HASHED: True
UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE
SCRAPE_MISSING_CATEGORIES: False # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
PROVIDERS:
RAPIDS:
COMPUTE: True
INCLUDE_EPISODE_FEATURES: True
COMPUTE: False
INCLUDE_EPISODE_FEATURES: False
SINGLE_CATEGORIES: ["all", "email"]
MULTIPLE_CATEGORIES:
social: ["socialnetworks", "socialmediatools"]
@ -127,7 +121,7 @@ PHONE_APPLICATIONS_FOREGROUND:
dating: ["com.tinder", "com.relance.happycouple", "com.kiwi.joyride"]
SINGLE_APPS: ["top1global", "com.facebook.moments", "com.google.android.youtube", "com.twitter.android"] # There's no entropy for single apps
EXCLUDED_CATEGORIES: []
EXCLUDED_APPS: ["com.fitbit.FitbitMobile", "com.aware.plugin.upmc.cancer"] # TODO list system apps?
EXCLUDED_APPS: ["com.fitbit.FitbitMobile", "com.aware.plugin.upmc.cancer"]
FEATURES:
APP_EVENTS: ["countevent", "timeoffirstuse", "timeoflastuse", "frequencyentropy"]
APP_EPISODES: ["countepisode", "minduration", "maxduration", "meanduration", "sumduration"]
@ -137,7 +131,7 @@ PHONE_APPLICATIONS_FOREGROUND:
# See https://www.rapids.science/latest/features/phone-applications-notifications/
PHONE_APPLICATIONS_NOTIFICATIONS:
CONTAINER: notifications
CONTAINER: applications_notifications
APPLICATION_CATEGORIES:
CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scrapped from the Play Store)
CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv"
@ -151,7 +145,7 @@ PHONE_BATTERY:
EPISODE_THRESHOLD_BETWEEN_ROWS: 30 # minutes. Max time difference for two consecutive rows to be considered within the same battery episode.
PROVIDERS:
RAPIDS:
COMPUTE: True
COMPUTE: False
FEATURES: ["countdischarge", "sumdurationdischarge", "countcharge", "sumdurationcharge", "avgconsumptionrate", "maxconsumptionrate"]
SRC_SCRIPT: src/features/phone_battery/rapids/main.py
@ -160,12 +154,12 @@ PHONE_BLUETOOTH:
CONTAINER: bluetooth
PROVIDERS:
RAPIDS:
COMPUTE: True
COMPUTE: False
FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
SRC_SCRIPT: src/features/phone_bluetooth/rapids/main.R
DORYAB:
COMPUTE: True
COMPUTE: False
FEATURES:
ALL:
DEVICES: ["countscans", "uniquedevices", "meanscans", "stdscans"]
@ -183,7 +177,7 @@ PHONE_BLUETOOTH:
# See https://www.rapids.science/latest/features/phone-calls/
PHONE_CALLS:
CONTAINER: call
CONTAINER: calls.csv
PROVIDERS:
RAPIDS:
COMPUTE: True
@ -196,7 +190,7 @@ PHONE_CALLS:
SRC_SCRIPT: src/features/phone_calls/rapids/main.R
# See https://www.rapids.science/latest/features/phone-conversation/
PHONE_CONVERSATION: # TODO Adapt for speech
PHONE_CONVERSATION:
CONTAINER:
ANDROID: plugin_studentlife_audio_android
IOS: plugin_studentlife_audio
@ -215,21 +209,10 @@ PHONE_CONVERSATION: # TODO Adapt for speech
# See https://www.rapids.science/latest/features/phone-data-yield/
PHONE_DATA_YIELD:
SENSORS: [#PHONE_ACCELEROMETER,
PHONE_ACTIVITY_RECOGNITION,
PHONE_APPLICATIONS_FOREGROUND,
PHONE_APPLICATIONS_NOTIFICATIONS,
PHONE_BATTERY,
PHONE_BLUETOOTH,
PHONE_CALLS,
PHONE_LIGHT,
PHONE_LOCATIONS,
PHONE_MESSAGES,
PHONE_SCREEN,
PHONE_WIFI_VISIBLE]
SENSORS: []
PROVIDERS:
RAPIDS:
COMPUTE: True
COMPUTE: False
FEATURES: [ratiovalidyieldedminutes, ratiovalidyieldedhours]
MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1, minimum percentage of valid minutes in an hour to be considered valid.
SRC_SCRIPT: src/features/phone_data_yield/rapids/main.R
@ -245,10 +228,10 @@ PHONE_KEYBOARD:
# See https://www.rapids.science/latest/features/phone-light/
PHONE_LIGHT:
CONTAINER: light_sensor
CONTAINER: light
PROVIDERS:
RAPIDS:
COMPUTE: True
COMPUTE: False
FEATURES: ["count", "maxlux", "minlux", "avglux", "medianlux", "stdlux"]
SRC_SCRIPT: src/features/phone_light/rapids/main.py
@ -262,7 +245,7 @@ PHONE_LOCATIONS:
PROVIDERS:
DORYAB:
COMPUTE: True
COMPUTE: False
FEATURES: ["locationvariance","loglocationvariance","totaldistance","avgspeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","avglengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome", "homelabel"]
DBSCAN_EPS: 100 # meters
DBSCAN_MINSAMPLES: 5
@ -277,7 +260,7 @@ PHONE_LOCATIONS:
SRC_SCRIPT: src/features/phone_locations/doryab/main.py
BARNETT:
COMPUTE: True
COMPUTE: False
FEATURES: ["hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","circdnrtn","wkenddayrtn"]
IF_MULTIPLE_TIMEZONES: USE_MOST_COMMON
MINUTES_DATA_USED: False # Use this for quality control purposes, how many minutes of data (location coordinates gruped by minute) were used to compute features
@ -292,10 +275,10 @@ PHONE_LOG:
# See https://www.rapids.science/latest/features/phone-messages/
PHONE_MESSAGES:
CONTAINER: sms
CONTAINER: messages
PROVIDERS:
RAPIDS:
COMPUTE: True
COMPUTE: False
MESSAGES_TYPES : [received, sent]
FEATURES:
received: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact]
@ -307,7 +290,7 @@ PHONE_SCREEN:
CONTAINER: screen
PROVIDERS:
RAPIDS:
COMPUTE: True
COMPUTE: False
REFERENCE_HOUR_FIRST_USE: 0
IGNORE_EPISODES_SHORTER_THAN: 0 # in minutes, set to 0 to disable
IGNORE_EPISODES_LONGER_THAN: 360 # in minutes, set to 0 to disable
@ -329,7 +312,7 @@ PHONE_WIFI_VISIBLE:
CONTAINER: wifi
PROVIDERS:
RAPIDS:
COMPUTE: True
COMPUTE: False
FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
SRC_SCRIPT: src/features/phone_wifi_visible/rapids/main.R
@ -557,16 +540,16 @@ EMPATICA_TAGS:
# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#1-histograms-of-phone-data-yield
HISTOGRAM_PHONE_DATA_YIELD:
PLOT: True
PLOT: False
# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#2-heatmaps-of-overall-data-yield
HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT:
PLOT: True
PLOT: False
TIME: RELATIVE_TIME # ABSOLUTE_TIME or RELATIVE_TIME
# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#3-heatmap-of-recorded-phone-sensors
HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT:
PLOT: True
PLOT: False
# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#4-heatmap-of-sensor-row-count
HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT:
@ -577,7 +560,7 @@ HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT:
# See https://www.rapids.science/latest/visualizations/feature-visualizations/#1-heatmap-correlation-matrix
HEATMAP_FEATURE_CORRELATION_MATRIX:
PLOT: True
PLOT: False
MIN_ROWS_RATIO: 0.5
CORR_THRESHOLD: 0.1
CORR_METHOD: "pearson" # choose from {"pearson", "kendall", "spearman"}
@ -590,17 +573,17 @@ HEATMAP_FEATURE_CORRELATION_MATRIX:
ALL_CLEANING_INDIVIDUAL:
PROVIDERS:
RAPIDS:
COMPUTE: True
COMPUTE: False
IMPUTE_SELECTED_EVENT_FEATURES:
COMPUTE: True
MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
COLS_NAN_THRESHOLD: 0.3 # set to 1 to disable
COLS_VAR_THRESHOLD: True
ROWS_NAN_THRESHOLD: 1 # set to 1 to disable
ROWS_NAN_THRESHOLD: 0.3 # set to 1 to disable
DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
DATA_YIELD_RATIO_THRESHOLD: 0.3 # set to 0 to disable
DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable
DROP_HIGHLY_CORRELATED_FEATURES:
COMPUTE: False
COMPUTE: True
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
CORR_THRESHOLD: 0.95
SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R
@ -608,32 +591,17 @@ ALL_CLEANING_INDIVIDUAL:
ALL_CLEANING_OVERALL:
PROVIDERS:
RAPIDS:
COMPUTE: True
COMPUTE: False
IMPUTE_SELECTED_EVENT_FEATURES:
COMPUTE: True
MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
COLS_NAN_THRESHOLD: 0.3 # set to 1 to disable
COLS_VAR_THRESHOLD: True
ROWS_NAN_THRESHOLD: 1 # set to 1 to disable
ROWS_NAN_THRESHOLD: 0.3 # set to 1 to disable
DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
DATA_YIELD_RATIO_THRESHOLD: 0.3 # set to 0 to disable
DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable
DROP_HIGHLY_CORRELATED_FEATURES:
COMPUTE: False
COMPUTE: True
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
CORR_THRESHOLD: 0.95
SRC_SCRIPT: src/features/all_cleaning_overall/rapids/main.R
########################################################################################################################
# Analysis Workflow Example #
########################################################################################################################
PARAMS_FOR_ANALYSIS:
BASELINE:
FOLDER: data/external/baseline
CONTAINER: [results-survey637813_final.csv, # Slovenia
results-survey358134_final.csv, # Belgium 1
results-survey413767_final.csv # Belgium 2
]
QUESTION_LIST: survey637813+question_text.csv
FEATURES: [age, gender, startlanguage, demand, control, demand_control_ratio]
CATEGORICAL_FEATURES: [gender]
SRC_SCRIPT: src/features/all_cleaning_overall/rapids/main.R

View File

@ -1,9 +1,3 @@
label,start_time,length,repeats_on,repeats_value
threeday,00:00:00,2D 23H 59M 59S,every_day,0
daily,00:00:00,23H 59M 59S,every_day,0
morning,06:00:00,5H 59M 59S,every_day,0
afternoon,12:00:00,5H 59M 59S,every_day,0
evening,18:00:00,5H 59M 59S,every_day,0
night,00:00:00,5H 59M 59S,every_day,0
two_weeks_overlapping,00:00:00,13D 23H 59M 59S,every_day,0
weekends,00:00:00,2D 23H 59M 59S,wday,5

1 label start_time length repeats_on repeats_value
threeday 00:00:00 2D 23H 59M 59S every_day 0
2 daily 00:00:00 23H 59M 59S every_day 0
morning 06:00:00 5H 59M 59S every_day 0
afternoon 12:00:00 5H 59M 59S every_day 0
evening 18:00:00 5H 59M 59S every_day 0
3 night 00:00:00 5H 59M 59S every_day 0
two_weeks_overlapping 00:00:00 13D 23H 59M 59S every_day 0
weekends 00:00:00 2D 23H 59M 59S wday 5

View File

@ -4,36 +4,6 @@ rule create_example_participant_files:
shell:
"echo 'PHONE:\n DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524]\n PLATFORMS: [android]\n LABEL: test-01\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\nFITBIT:\n DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524]\n LABEL: test-01\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\n' >> ./data/external/participant_files/example01.yaml && echo 'PHONE:\n DEVICE_IDS: [13dbc8a3-dae3-4834-823a-4bc96a7d459d]\n PLATFORMS: [ios]\n LABEL: test-02\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\nFITBIT:\n DEVICE_IDS: [13dbc8a3-dae3-4834-823a-4bc96a7d459d]\n LABEL: test-02\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\n' >> ./data/external/participant_files/example02.yaml"
rule query_usernames_device_empatica_ids:
params:
baseline_folder = "/mnt/e/STRAWbaseline/"
output:
usernames_file = config["CREATE_PARTICIPANT_FILES"]["USERNAMES_CSV"],
timezone_file = config["TIMEZONE"]["MULTIPLE"]["TZ_FILE"]
script:
"../../participants/prepare_usernames_file.py"
rule prepare_tzcodes_file:
input:
timezone_file = config["TIMEZONE"]["MULTIPLE"]["TZ_FILE"]
output:
tzcodes_file = config["TIMEZONE"]["MULTIPLE"]["TZCODES_FILE"]
script:
"../tools/create_multi_timezones_file.py"
rule prepare_participants_csv:
input:
username_list = config["CREATE_PARTICIPANT_FILES"]["USERNAMES_CSV"]
params:
data_configuration = config["PHONE_DATA_STREAMS"][config["PHONE_DATA_STREAMS"]["USE"]],
participants_table = "participants",
device_id_table = "esm",
start_end_date_table = "esm"
output:
participants_file = config["CREATE_PARTICIPANT_FILES"]["CSV_FILE_PATH"]
script:
"../src/data/translate_usernames_into_participants_data.R"
rule create_participants_files:
input:
participants_file = config["CREATE_PARTICIPANT_FILES"]["CSV_FILE_PATH"]
@ -249,4 +219,4 @@ rule empatica_readable_datetime:
output:
"data/raw/{pid}/empatica_{sensor}_with_datetime.csv"
script:
"../src/data/datetime/readable_datetime.R"
"../src/data/datetime/readable_datetime.R"