From 5178be585def1885d943a011b489e63f72cc421c Mon Sep 17 00:00:00 2001 From: Meng Li <34143965+Meng6@users.noreply.github.com> Date: Wed, 25 Nov 2020 22:35:38 -0500 Subject: [PATCH] Rename modeling.py to modelling.py & Update example_config.yaml --- example_profile/example_config.yaml | 273 +++++++++--------- rules/models.smk | 17 +- src/models/workflow_example/baselines.py | 2 +- .../{modeling.py => modelling.py} | 2 +- .../{modeling_utils.py => modelling_utils.py} | 0 5 files changed, 146 insertions(+), 148 deletions(-) rename src/models/workflow_example/{modeling.py => modelling.py} (98%) rename src/models/workflow_example/{modeling_utils.py => modelling_utils.py} (100%) diff --git a/example_profile/example_config.yaml b/example_profile/example_config.yaml index 70296f31..5e27104d 100644 --- a/example_profile/example_config.yaml +++ b/example_profile/example_config.yaml @@ -31,8 +31,11 @@ DAY_SEGMENTS: &day_segments FILE: "example_profile/exampleworkflow_daysegments.csv" INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC, see docs -############## PHONE ########################################################### -################################################################################ + + +######################################################################################################################## +# PHONE # +######################################################################################################################## # See https://www.rapids.science/setup/configuration/#device-data-source-configuration PHONE_DATA_CONFIGURATION: @@ -45,31 +48,81 @@ PHONE_DATA_CONFIGURATION: VALUE: *timezone # IF TYPE=SINGLE, see docs # Sensors ------ - -PHONE_DATA_YIELD: - SENSORS: [PHONE_ACCELEROMETER, PHONE_ACTIVITY_RECOGNITION, PHONE_APPLICATIONS_FOREGROUND, PHONE_BATTERY, PHONE_BLUETOOTH, PHONE_CALLS, PHONE_CONVERSATION, PHONE_LIGHT, PHONE_LOCATIONS, PHONE_MESSAGES, PHONE_SCREEN, PHONE_WIFI_CONNECTED, PHONE_WIFI_VISIBLE] + 
+PHONE_ACCELEROMETER: + TABLE: accelerometer + PROVIDERS: + RAPIDS: + COMPUTE: False + FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"] + SRC_FOLDER: "rapids" # inside src/features/phone_accelerometer + SRC_LANGUAGE: "python" + + PANDA: + COMPUTE: False + VALID_SENSED_MINUTES: False + FEATURES: + exertional_activity_episode: ["sumduration", "maxduration", "minduration", "avgduration", "medianduration", "stdduration"] + nonexertional_activity_episode: ["sumduration", "maxduration", "minduration", "avgduration", "medianduration", "stdduration"] + SRC_FOLDER: "panda" # inside src/features/phone_accelerometer + SRC_LANGUAGE: "python" + +PHONE_ACTIVITY_RECOGNITION: + TABLE: + ANDROID: plugin_google_activity_recognition + IOS: plugin_ios_activity_recognition + EPISODE_THRESHOLD_BETWEEN_ROWS: 5 # minutes. Max time difference for two consecutive rows to be considered within the same activity episode. PROVIDERS: RAPIDS: COMPUTE: True - FEATURES: [ratiovalidyieldedminutes, ratiovalidyieldedhours] - MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1 representing the number of minutes with at least - SRC_LANGUAGE: "r" - SRC_FOLDER: "rapids" # inside src/features/phone_data_yield + FEATURES: ["count", "mostcommonactivity", "countuniqueactivities", "durationstationary", "durationmobile", "durationvehicle"] + ACTIVITY_CLASSES: + STATIONARY: ["still", "tilting"] + MOBILE: ["on_foot", "walking", "running", "on_bicycle"] + VEHICLE: ["in_vehicle"] + SRC_FOLDER: "rapids" # inside src/features/phone_activity_recognition + SRC_LANGUAGE: "python" -# Communication SMS features config, TYPES and FEATURES keys need to match -PHONE_MESSAGES: - TABLE: messages +PHONE_APPLICATIONS_FOREGROUND: + TABLE: applications_foreground + APPLICATION_CATEGORIES: + CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scraped from the Play Store) + CATALOGUE_FILE: 
"data/external/stachl_application_genre_catalogue.csv" + UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE + SCRAPE_MISSING_CATEGORIES: False # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway PROVIDERS: RAPIDS: COMPUTE: True - MESSAGES_TYPES : [received, sent] - FEATURES: - received: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact] - sent: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact] - SRC_LANGUAGE: "r" - SRC_FOLDER: "rapids" # inside src/features/phone_messages + SINGLE_CATEGORIES: ["all", "email"] + MULTIPLE_CATEGORIES: + social: ["socialnetworks", "socialmediatools"] + entertainment: ["entertainment", "gamingknowledge", "gamingcasual", "gamingadventure", "gamingstrategy", "gamingtoolscommunity", "gamingroleplaying", "gamingaction", "gaminglogic", "gamingsports", "gamingsimulation"] + SINGLE_APPS: ["top1global", "com.facebook.moments", "com.google.android.youtube", "com.twitter.android"] # There's no entropy for single apps + EXCLUDED_CATEGORIES: ["system_apps"] + EXCLUDED_APPS: ["com.fitbit.FitbitMobile", "com.aware.plugin.upmc.cancer"] + FEATURES: ["count", "timeoffirstuse", "timeoflastuse", "frequencyentropy"] + SRC_FOLDER: "rapids" # inside src/features/phone_applications_foreground + SRC_LANGUAGE: "python" + +PHONE_BATTERY: + TABLE: battery + EPISODE_THRESHOLD_BETWEEN_ROWS: 30 # minutes. Max time difference for two consecutive rows to be considered within the same battery episode. 
+ PROVIDERS: + RAPIDS: + COMPUTE: True + FEATURES: ["countdischarge", "sumdurationdischarge", "countcharge", "sumdurationcharge", "avgconsumptionrate", "maxconsumptionrate"] + SRC_FOLDER: "rapids" # inside src/features/phone_battery + SRC_LANGUAGE: "python" + +PHONE_BLUETOOTH: + TABLE: bluetooth + PROVIDERS: + RAPIDS: + COMPUTE: True + FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"] + SRC_FOLDER: "rapids" # inside src/features/phone_bluetooth + SRC_LANGUAGE: "r" -# Communication call features config, TYPES and FEATURES keys need to match PHONE_CALLS: TABLE: calls PROVIDERS: @@ -83,6 +136,43 @@ PHONE_CALLS: SRC_LANGUAGE: "r" SRC_FOLDER: "rapids" # inside src/features/phone_calls +PHONE_CONVERSATION: + TABLE: + ANDROID: plugin_studentlife_audio_android + IOS: plugin_studentlife_audio + PROVIDERS: + RAPIDS: + COMPUTE: True + FEATURES: ["minutessilence", "minutesnoise", "minutesvoice", "minutesunknown","sumconversationduration","avgconversationduration", + "sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","noisesumenergy", + "noiseavgenergy","noisesdenergy","noiseminenergy","noisemaxenergy","voicesumenergy", + "voiceavgenergy","voicesdenergy","voiceminenergy","voicemaxenergy","silencesensedfraction","noisesensedfraction", + "voicesensedfraction","unknownsensedfraction","silenceexpectedfraction","noiseexpectedfraction","voiceexpectedfraction", + "unknownexpectedfraction","countconversation"] + RECORDING_MINUTES: 1 + PAUSED_MINUTES : 3 + SRC_FOLDER: "rapids" # inside src/features/phone_conversation + SRC_LANGUAGE: "python" + +PHONE_DATA_YIELD: + SENSORS: [PHONE_ACCELEROMETER, PHONE_ACTIVITY_RECOGNITION, PHONE_APPLICATIONS_FOREGROUND, PHONE_BATTERY, PHONE_BLUETOOTH, PHONE_CALLS, PHONE_CONVERSATION, PHONE_LIGHT, PHONE_LOCATIONS, PHONE_MESSAGES, PHONE_SCREEN, PHONE_WIFI_CONNECTED, PHONE_WIFI_VISIBLE] + PROVIDERS: + RAPIDS: + COMPUTE: True + FEATURES: [ratiovalidyieldedminutes, 
ratiovalidyieldedhours] + MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1, minimum ratio of valid yielded minutes within an hour for that hour to count as a valid yielded hour + SRC_LANGUAGE: "r" + SRC_FOLDER: "rapids" # inside src/features/phone_data_yield + +PHONE_LIGHT: + TABLE: light + PROVIDERS: + RAPIDS: + COMPUTE: True + FEATURES: ["count", "maxlux", "minlux", "avglux", "medianlux", "stdlux"] + SRC_FOLDER: "rapids" # inside src/features/phone_light + SRC_LANGUAGE: "python" + PHONE_LOCATIONS: TABLE: locations LOCATIONS_TO_USE: FUSED_RESAMPLED # ALL, GPS OR FUSED_RESAMPLED @@ -110,41 +200,17 @@ PHONE_LOCATIONS: SRC_FOLDER: "barnett" # inside src/features/phone_locations SRC_LANGUAGE: "r" -PHONE_BLUETOOTH: - TABLE: bluetooth +PHONE_MESSAGES: + TABLE: messages PROVIDERS: RAPIDS: COMPUTE: True - FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"] - SRC_FOLDER: "rapids" # inside src/features/phone_bluetooth + MESSAGES_TYPES : [received, sent] + FEATURES: + received: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact] + sent: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact] SRC_LANGUAGE: "r" - - -PHONE_ACTIVITY_RECOGNITION: - TABLE: - ANDROID: plugin_google_activity_recognition - IOS: plugin_ios_activity_recognition - EPISODE_THRESHOLD_BETWEEN_ROWS: 5 # minutes. Max time difference for two consecutive rows to be considered within the same battery episode. - PROVIDERS: - RAPIDS: - COMPUTE: True - FEATURES: ["count", "mostcommonactivity", "countuniqueactivities", "durationstationary", "durationmobile", "durationvehicle"] - ACTIVITY_CLASSES: - STATIONARY: ["still", "tilting"] - MOBILE: ["on_foot", "walking", "running", "on_bicycle"] - VEHICLE: ["in_vehicle"] - SRC_FOLDER: "rapids" # inside src/features/phone_activity_recognition - SRC_LANGUAGE: "python" - -PHONE_BATTERY: - TABLE: battery - EPISODE_THRESHOLD_BETWEEN_ROWS: 30 # minutes. 
Max time difference for two consecutive rows to be considered within the same battery episode. - PROVIDERS: - RAPIDS: - COMPUTE: True - FEATURES: ["countdischarge", "sumdurationdischarge", "countcharge", "sumdurationcharge", "avgconsumptionrate", "maxconsumptionrate"] - SRC_FOLDER: "rapids" # inside src/features/phone_battery - SRC_LANGUAGE: "python" + SRC_FOLDER: "rapids" # inside src/features/phone_messages PHONE_SCREEN: TABLE: screen @@ -159,53 +225,14 @@ PHONE_SCREEN: SRC_FOLDER: "rapids" # inside src/features/phone_screen SRC_LANGUAGE: "python" -PHONE_LIGHT: - TABLE: light +PHONE_WIFI_CONNECTED: + TABLE: "sensor_wifi" PROVIDERS: RAPIDS: COMPUTE: True - FEATURES: ["count", "maxlux", "minlux", "avglux", "medianlux", "stdlux"] - SRC_FOLDER: "rapids" # inside src/features/phone_light - SRC_LANGUAGE: "python" - -PHONE_ACCELEROMETER: - TABLE: accelerometer - PROVIDERS: - RAPIDS: - COMPUTE: False - FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"] - SRC_FOLDER: "rapids" # inside src/features/phone_accelerometer - SRC_LANGUAGE: "python" - - PANDA: - COMPUTE: False - VALID_SENSED_MINUTES: False - FEATURES: - exertional_activity_episode: ["sumduration", "maxduration", "minduration", "avgduration", "medianduration", "stdduration"] - nonexertional_activity_episode: ["sumduration", "maxduration", "minduration", "avgduration", "medianduration", "stdduration"] - SRC_FOLDER: "panda" # inside src/features/phone_accelerometer - SRC_LANGUAGE: "python" - -PHONE_APPLICATIONS_FOREGROUND: - TABLE: applications_foreground - APPLICATION_CATEGORIES: - CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scrapped from the Play Store) - CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv" - UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE - 
SCRAPE_MISSING_CATEGORIES: False # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway - PROVIDERS: - RAPIDS: - COMPUTE: True - SINGLE_CATEGORIES: ["all", "email"] - MULTIPLE_CATEGORIES: - social: ["socialnetworks", "socialmediatools"] - entertainment: ["entertainment", "gamingknowledge", "gamingcasual", "gamingadventure", "gamingstrategy", "gamingtoolscommunity", "gamingroleplaying", "gamingaction", "gaminglogic", "gamingsports", "gamingsimulation"] - SINGLE_APPS: ["top1global", "com.facebook.moments", "com.google.android.youtube", "com.twitter.android"] # There's no entropy for single apps - EXCLUDED_CATEGORIES: ["system_apps"] - EXCLUDED_APPS: ["com.fitbit.FitbitMobile", "com.aware.plugin.upmc.cancer"] - FEATURES: ["count", "timeoffirstuse", "timeoflastuse", "frequencyentropy"] - SRC_FOLDER: "rapids" # inside src/features/phone_applications_foreground - SRC_LANGUAGE: "python" + FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"] + SRC_FOLDER: "rapids" # inside src/features/phone_wifi_connected + SRC_LANGUAGE: "r" PHONE_WIFI_VISIBLE: TABLE: "wifi" @@ -216,36 +243,13 @@ PHONE_WIFI_VISIBLE: SRC_FOLDER: "rapids" # inside src/features/phone_wifi_visible SRC_LANGUAGE: "r" -PHONE_WIFI_CONNECTED: - TABLE: "sensor_wifi" - PROVIDERS: - RAPIDS: - COMPUTE: True - FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"] - SRC_FOLDER: "rapids" # inside src/features/phone_wifi_connected - SRC_LANGUAGE: "r" -PHONE_CONVERSATION: - TABLE: - ANDROID: plugin_studentlife_audio_android - IOS: plugin_studentlife_audio - PROVIDERS: - RAPIDS: - COMPUTE: True - FEATURES: ["minutessilence", "minutesnoise", "minutesvoice", "minutesunknown","sumconversationduration","avgconversationduration", - "sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","noisesumenergy", - 
"noiseavgenergy","noisesdenergy","noiseminenergy","noisemaxenergy","voicesumenergy", - "voiceavgenergy","voicesdenergy","voiceminenergy","voicemaxenergy","silencesensedfraction","noisesensedfraction", - "voicesensedfraction","unknownsensedfraction","silenceexpectedfraction","noiseexpectedfraction","voiceexpectedfraction", - "unknownexpectedfraction","countconversation"] - RECORDING_MINUTES: 1 - PAUSED_MINUTES : 3 - SRC_FOLDER: "rapids" # inside src/features/phone_conversation - SRC_LANGUAGE: "python" -############## FITBIT ########################################################## -################################################################################ +######################################################################################################################## +# FITBIT # +######################################################################################################################## +# See https://www.rapids.science/latest/setup/configuration/#device-data-source-configuration FITBIT_DATA_CONFIGURATION: SOURCE: TYPE: DATABASE # DATABASE or FILES (set each [FITBIT_SENSOR][TABLE] attribute with a table name or a file path accordingly) @@ -276,6 +280,17 @@ FITBIT_HEARTRATE_INTRADAY: SRC_FOLDER: "rapids" # inside src/features/fitbit_heartrate_intraday SRC_LANGUAGE: "python" +FITBIT_SLEEP_SUMMARY: + TABLE: fitbit_data + SLEEP_EPISODE_TIMESTAMP: end # summary sleep episodes are considered as events based on either the start timestamp or end timestamp. 
+ PROVIDERS: + RAPIDS: + COMPUTE: True + FEATURES: ["countepisode", "avgefficiency", "sumdurationafterwakeup", "sumdurationasleep", "sumdurationawake", "sumdurationtofallasleep", "sumdurationinbed", "avgdurationafterwakeup", "avgdurationasleep", "avgdurationawake", "avgdurationtofallasleep", "avgdurationinbed"] + SLEEP_TYPES: ["main", "nap", "all"] + SRC_FOLDER: "rapids" # inside src/features/fitbit_sleep_summary + SRC_LANGUAGE: "python" + FITBIT_STEPS_SUMMARY: TABLE: fitbit_data PROVIDERS: @@ -299,19 +314,11 @@ FITBIT_STEPS_INTRADAY: SRC_FOLDER: "rapids" # inside src/features/fitbit_steps_intraday SRC_LANGUAGE: "python" -FITBIT_SLEEP_SUMMARY: - TABLE: fitbit_data - SLEEP_EPISODE_TIMESTAMP: end # summary sleep episodes are considered as events based on either the start timestamp or end timestamp. - PROVIDERS: - RAPIDS: - COMPUTE: True - FEATURES: ["countepisode", "avgefficiency", "sumdurationafterwakeup", "sumdurationasleep", "sumdurationawake", "sumdurationtofallasleep", "sumdurationinbed", "avgdurationafterwakeup", "avgdurationasleep", "avgdurationawake", "avgdurationtofallasleep", "avgdurationinbed"] - SLEEP_TYPES: ["main", "nap", "all"] - SRC_FOLDER: "rapids" # inside src/features/fitbit_sleep_summary - SRC_LANGUAGE: "python" -### Analysis Workflow Example ################################################## -################################################################################ + +######################################################################################################################## +# Analysis Workflow Example # +######################################################################################################################## PARAMS_FOR_ANALYSIS: CATEGORICAL_OPERATORS: [mostcommon] diff --git a/rules/models.smk b/rules/models.smk index b0742e32..acca3b72 100644 --- a/rules/models.smk +++ b/rules/models.smk @@ -79,15 +79,6 @@ rule clean_sensor_features_for_all_participants: script: 
"../src/models/workflow_example/clean_sensor_features.R" - - - - - - - - - rule merge_features_and_targets_for_individual_model: input: cleaned_sensor_features = "data/processed/features/{pid}/all_sensor_features_cleaned.csv", @@ -133,7 +124,7 @@ rule baselines_for_population_model: script: "../src/models/workflow_example/baselines.py" -rule modeling_for_individual_participants: +rule modelling_for_individual_participants: input: data = "data/processed/models/individual_model/{pid}/input.csv" params: @@ -151,9 +142,9 @@ rule modeling_for_individual_participants: log: "data/processed/models/individual_model/{pid}/output_{cv_method}/{model}/{scaler}/notes.log" script: - "../src/models/workflow_example/modeling.py" + "../src/models/workflow_example/modelling.py" -rule modeling_for_all_participants: +rule modelling_for_all_participants: input: data = "data/processed/models/population_model/input.csv" params: @@ -171,4 +162,4 @@ rule modeling_for_all_participants: log: "data/processed/models/population_model/output_{cv_method}/{model}/{scaler}/notes.log" script: - "../src/models/workflow_example/modeling.py" + "../src/models/workflow_example/modelling.py" diff --git a/src/models/workflow_example/baselines.py b/src/models/workflow_example/baselines.py index 8db60593..9c1ae564 100644 --- a/src/models/workflow_example/baselines.py +++ b/src/models/workflow_example/baselines.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd from statistics import mean -from modeling_utils import getMetrics, createPipeline +from modelling_utils import getMetrics, createPipeline from sklearn.model_selection import LeaveOneOut diff --git a/src/models/workflow_example/modeling.py b/src/models/workflow_example/modelling.py similarity index 98% rename from src/models/workflow_example/modeling.py rename to src/models/workflow_example/modelling.py index 41323600..a83ee0c5 100644 --- a/src/models/workflow_example/modeling.py +++ b/src/models/workflow_example/modelling.py @@ -1,6 +1,6 @@ 
import pandas as pd import numpy as np -from modeling_utils import getMatchingColNames, getNormAllParticipantsScaler, getMetrics, getFeatureImportances, createPipeline +from modelling_utils import getMatchingColNames, getNormAllParticipantsScaler, getMetrics, getFeatureImportances, createPipeline from sklearn.model_selection import LeaveOneOut, GridSearchCV diff --git a/src/models/workflow_example/modeling_utils.py b/src/models/workflow_example/modelling_utils.py similarity index 100% rename from src/models/workflow_example/modeling_utils.py rename to src/models/workflow_example/modelling_utils.py