Rename modeling.py to modelling.py & Update example_config.yaml

pull/103/head
Meng Li 2020-11-25 22:35:38 -05:00
parent 5f51c94ac6
commit 5178be585d
5 changed files with 146 additions and 148 deletions

View File

@ -31,8 +31,11 @@ DAY_SEGMENTS: &day_segments
FILE: "example_profile/exampleworkflow_daysegments.csv"
INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC, see docs
############## PHONE ###########################################################
################################################################################
########################################################################################################################
# PHONE #
########################################################################################################################
# See https://www.rapids.science/setup/configuration/#device-data-source-configuration
PHONE_DATA_CONFIGURATION:
@ -46,30 +49,80 @@ PHONE_DATA_CONFIGURATION:
# Sensors ------
PHONE_DATA_YIELD:
SENSORS: [PHONE_ACCELEROMETER, PHONE_ACTIVITY_RECOGNITION, PHONE_APPLICATIONS_FOREGROUND, PHONE_BATTERY, PHONE_BLUETOOTH, PHONE_CALLS, PHONE_CONVERSATION, PHONE_LIGHT, PHONE_LOCATIONS, PHONE_MESSAGES, PHONE_SCREEN, PHONE_WIFI_CONNECTED, PHONE_WIFI_VISIBLE]
PHONE_ACCELEROMETER:
TABLE: accelerometer
PROVIDERS:
RAPIDS:
COMPUTE: True
FEATURES: [ratiovalidyieldedminutes, ratiovalidyieldedhours]
MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1 representing the number of minutes with at least
SRC_LANGUAGE: "r"
SRC_FOLDER: "rapids" # inside src/features/phone_data_yield
COMPUTE: False
FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"]
SRC_FOLDER: "rapids" # inside src/features/phone_accelerometer
SRC_LANGUAGE: "python"
# Communication SMS features config, TYPES and FEATURES keys need to match
PHONE_MESSAGES:
TABLE: messages
PROVIDERS:
RAPIDS:
COMPUTE: True
MESSAGES_TYPES : [received, sent]
PANDA:
COMPUTE: False
VALID_SENSED_MINUTES: False
FEATURES:
received: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact]
sent: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact]
SRC_LANGUAGE: "r"
SRC_FOLDER: "rapids" # inside src/features/phone_messages
exertional_activity_episode: ["sumduration", "maxduration", "minduration", "avgduration", "medianduration", "stdduration"]
nonexertional_activity_episode: ["sumduration", "maxduration", "minduration", "avgduration", "medianduration", "stdduration"]
SRC_FOLDER: "panda" # inside src/features/phone_accelerometer
SRC_LANGUAGE: "python"
PHONE_ACTIVITY_RECOGNITION:
TABLE:
ANDROID: plugin_google_activity_recognition
IOS: plugin_ios_activity_recognition
EPISODE_THRESHOLD_BETWEEN_ROWS: 5 # minutes. Max time difference for two consecutive rows to be considered within the same battery episode.
PROVIDERS:
RAPIDS:
COMPUTE: True
FEATURES: ["count", "mostcommonactivity", "countuniqueactivities", "durationstationary", "durationmobile", "durationvehicle"]
ACTIVITY_CLASSES:
STATIONARY: ["still", "tilting"]
MOBILE: ["on_foot", "walking", "running", "on_bicycle"]
VEHICLE: ["in_vehicle"]
SRC_FOLDER: "rapids" # inside src/features/phone_activity_recognition
SRC_LANGUAGE: "python"
PHONE_APPLICATIONS_FOREGROUND:
TABLE: applications_foreground
APPLICATION_CATEGORIES:
CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scrapped from the Play Store)
CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv"
UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE
SCRAPE_MISSING_CATEGORIES: False # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
PROVIDERS:
RAPIDS:
COMPUTE: True
SINGLE_CATEGORIES: ["all", "email"]
MULTIPLE_CATEGORIES:
social: ["socialnetworks", "socialmediatools"]
entertainment: ["entertainment", "gamingknowledge", "gamingcasual", "gamingadventure", "gamingstrategy", "gamingtoolscommunity", "gamingroleplaying", "gamingaction", "gaminglogic", "gamingsports", "gamingsimulation"]
SINGLE_APPS: ["top1global", "com.facebook.moments", "com.google.android.youtube", "com.twitter.android"] # There's no entropy for single apps
EXCLUDED_CATEGORIES: ["system_apps"]
EXCLUDED_APPS: ["com.fitbit.FitbitMobile", "com.aware.plugin.upmc.cancer"]
FEATURES: ["count", "timeoffirstuse", "timeoflastuse", "frequencyentropy"]
SRC_FOLDER: "rapids" # inside src/features/phone_applications_foreground
SRC_LANGUAGE: "python"
PHONE_BATTERY:
TABLE: battery
EPISODE_THRESHOLD_BETWEEN_ROWS: 30 # minutes. Max time difference for two consecutive rows to be considered within the same battery episode.
PROVIDERS:
RAPIDS:
COMPUTE: True
FEATURES: ["countdischarge", "sumdurationdischarge", "countcharge", "sumdurationcharge", "avgconsumptionrate", "maxconsumptionrate"]
SRC_FOLDER: "rapids" # inside src/features/phone_battery
SRC_LANGUAGE: "python"
PHONE_BLUETOOTH:
TABLE: bluetooth
PROVIDERS:
RAPIDS:
COMPUTE: True
FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
SRC_FOLDER: "rapids" # inside src/features/phone_bluetooth
SRC_LANGUAGE: "r"
# Communication call features config, TYPES and FEATURES keys need to match
PHONE_CALLS:
TABLE: calls
PROVIDERS:
@ -83,6 +136,43 @@ PHONE_CALLS:
SRC_LANGUAGE: "r"
SRC_FOLDER: "rapids" # inside src/features/phone_calls
PHONE_CONVERSATION:
TABLE:
ANDROID: plugin_studentlife_audio_android
IOS: plugin_studentlife_audio
PROVIDERS:
RAPIDS:
COMPUTE: True
FEATURES: ["minutessilence", "minutesnoise", "minutesvoice", "minutesunknown","sumconversationduration","avgconversationduration",
"sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","noisesumenergy",
"noiseavgenergy","noisesdenergy","noiseminenergy","noisemaxenergy","voicesumenergy",
"voiceavgenergy","voicesdenergy","voiceminenergy","voicemaxenergy","silencesensedfraction","noisesensedfraction",
"voicesensedfraction","unknownsensedfraction","silenceexpectedfraction","noiseexpectedfraction","voiceexpectedfraction",
"unknownexpectedfraction","countconversation"]
RECORDING_MINUTES: 1
PAUSED_MINUTES : 3
SRC_FOLDER: "rapids" # inside src/features/phone_conversation
SRC_LANGUAGE: "python"
PHONE_DATA_YIELD:
SENSORS: [PHONE_ACCELEROMETER, PHONE_ACTIVITY_RECOGNITION, PHONE_APPLICATIONS_FOREGROUND, PHONE_BATTERY, PHONE_BLUETOOTH, PHONE_CALLS, PHONE_CONVERSATION, PHONE_LIGHT, PHONE_LOCATIONS, PHONE_MESSAGES, PHONE_SCREEN, PHONE_WIFI_CONNECTED, PHONE_WIFI_VISIBLE]
PROVIDERS:
RAPIDS:
COMPUTE: True
FEATURES: [ratiovalidyieldedminutes, ratiovalidyieldedhours]
MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1 representing the number of minutes with at least
SRC_LANGUAGE: "r"
SRC_FOLDER: "rapids" # inside src/features/phone_data_yield
PHONE_LIGHT:
TABLE: light
PROVIDERS:
RAPIDS:
COMPUTE: True
FEATURES: ["count", "maxlux", "minlux", "avglux", "medianlux", "stdlux"]
SRC_FOLDER: "rapids" # inside src/features/phone_light
SRC_LANGUAGE: "python"
PHONE_LOCATIONS:
TABLE: locations
LOCATIONS_TO_USE: FUSED_RESAMPLED # ALL, GPS OR FUSED_RESAMPLED
@ -110,41 +200,17 @@ PHONE_LOCATIONS:
SRC_FOLDER: "barnett" # inside src/features/phone_locations
SRC_LANGUAGE: "r"
PHONE_BLUETOOTH:
TABLE: bluetooth
PHONE_MESSAGES:
TABLE: messages
PROVIDERS:
RAPIDS:
COMPUTE: True
FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
SRC_FOLDER: "rapids" # inside src/features/phone_bluetooth
MESSAGES_TYPES : [received, sent]
FEATURES:
received: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact]
sent: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact]
SRC_LANGUAGE: "r"
PHONE_ACTIVITY_RECOGNITION:
TABLE:
ANDROID: plugin_google_activity_recognition
IOS: plugin_ios_activity_recognition
EPISODE_THRESHOLD_BETWEEN_ROWS: 5 # minutes. Max time difference for two consecutive rows to be considered within the same battery episode.
PROVIDERS:
RAPIDS:
COMPUTE: True
FEATURES: ["count", "mostcommonactivity", "countuniqueactivities", "durationstationary", "durationmobile", "durationvehicle"]
ACTIVITY_CLASSES:
STATIONARY: ["still", "tilting"]
MOBILE: ["on_foot", "walking", "running", "on_bicycle"]
VEHICLE: ["in_vehicle"]
SRC_FOLDER: "rapids" # inside src/features/phone_activity_recognition
SRC_LANGUAGE: "python"
PHONE_BATTERY:
TABLE: battery
EPISODE_THRESHOLD_BETWEEN_ROWS: 30 # minutes. Max time difference for two consecutive rows to be considered within the same battery episode.
PROVIDERS:
RAPIDS:
COMPUTE: True
FEATURES: ["countdischarge", "sumdurationdischarge", "countcharge", "sumdurationcharge", "avgconsumptionrate", "maxconsumptionrate"]
SRC_FOLDER: "rapids" # inside src/features/phone_battery
SRC_LANGUAGE: "python"
SRC_FOLDER: "rapids" # inside src/features/phone_messages
PHONE_SCREEN:
TABLE: screen
@ -159,53 +225,14 @@ PHONE_SCREEN:
SRC_FOLDER: "rapids" # inside src/features/phone_screen
SRC_LANGUAGE: "python"
PHONE_LIGHT:
TABLE: light
PHONE_WIFI_CONNECTED:
TABLE: "sensor_wifi"
PROVIDERS:
RAPIDS:
COMPUTE: True
FEATURES: ["count", "maxlux", "minlux", "avglux", "medianlux", "stdlux"]
SRC_FOLDER: "rapids" # inside src/features/phone_light
SRC_LANGUAGE: "python"
PHONE_ACCELEROMETER:
TABLE: accelerometer
PROVIDERS:
RAPIDS:
COMPUTE: False
FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"]
SRC_FOLDER: "rapids" # inside src/features/phone_accelerometer
SRC_LANGUAGE: "python"
PANDA:
COMPUTE: False
VALID_SENSED_MINUTES: False
FEATURES:
exertional_activity_episode: ["sumduration", "maxduration", "minduration", "avgduration", "medianduration", "stdduration"]
nonexertional_activity_episode: ["sumduration", "maxduration", "minduration", "avgduration", "medianduration", "stdduration"]
SRC_FOLDER: "panda" # inside src/features/phone_accelerometer
SRC_LANGUAGE: "python"
PHONE_APPLICATIONS_FOREGROUND:
TABLE: applications_foreground
APPLICATION_CATEGORIES:
CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scrapped from the Play Store)
CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv"
UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE
SCRAPE_MISSING_CATEGORIES: False # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
PROVIDERS:
RAPIDS:
COMPUTE: True
SINGLE_CATEGORIES: ["all", "email"]
MULTIPLE_CATEGORIES:
social: ["socialnetworks", "socialmediatools"]
entertainment: ["entertainment", "gamingknowledge", "gamingcasual", "gamingadventure", "gamingstrategy", "gamingtoolscommunity", "gamingroleplaying", "gamingaction", "gaminglogic", "gamingsports", "gamingsimulation"]
SINGLE_APPS: ["top1global", "com.facebook.moments", "com.google.android.youtube", "com.twitter.android"] # There's no entropy for single apps
EXCLUDED_CATEGORIES: ["system_apps"]
EXCLUDED_APPS: ["com.fitbit.FitbitMobile", "com.aware.plugin.upmc.cancer"]
FEATURES: ["count", "timeoffirstuse", "timeoflastuse", "frequencyentropy"]
SRC_FOLDER: "rapids" # inside src/features/phone_applications_foreground
SRC_LANGUAGE: "python"
FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
SRC_FOLDER: "rapids" # inside src/features/phone_wifi_connected
SRC_LANGUAGE: "r"
PHONE_WIFI_VISIBLE:
TABLE: "wifi"
@ -216,36 +243,13 @@ PHONE_WIFI_VISIBLE:
SRC_FOLDER: "rapids" # inside src/features/phone_wifi_visible
SRC_LANGUAGE: "r"
PHONE_WIFI_CONNECTED:
TABLE: "sensor_wifi"
PROVIDERS:
RAPIDS:
COMPUTE: True
FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
SRC_FOLDER: "rapids" # inside src/features/phone_wifi_connected
SRC_LANGUAGE: "r"
PHONE_CONVERSATION:
TABLE:
ANDROID: plugin_studentlife_audio_android
IOS: plugin_studentlife_audio
PROVIDERS:
RAPIDS:
COMPUTE: True
FEATURES: ["minutessilence", "minutesnoise", "minutesvoice", "minutesunknown","sumconversationduration","avgconversationduration",
"sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","noisesumenergy",
"noiseavgenergy","noisesdenergy","noiseminenergy","noisemaxenergy","voicesumenergy",
"voiceavgenergy","voicesdenergy","voiceminenergy","voicemaxenergy","silencesensedfraction","noisesensedfraction",
"voicesensedfraction","unknownsensedfraction","silenceexpectedfraction","noiseexpectedfraction","voiceexpectedfraction",
"unknownexpectedfraction","countconversation"]
RECORDING_MINUTES: 1
PAUSED_MINUTES : 3
SRC_FOLDER: "rapids" # inside src/features/phone_conversation
SRC_LANGUAGE: "python"
############## FITBIT ##########################################################
################################################################################
########################################################################################################################
# FITBIT #
########################################################################################################################
# See https://www.rapids.science/latest/setup/configuration/#device-data-source-configuration
FITBIT_DATA_CONFIGURATION:
SOURCE:
TYPE: DATABASE # DATABASE or FILES (set each [FITBIT_SENSOR][TABLE] attribute with a table name or a file path accordingly)
@ -276,6 +280,17 @@ FITBIT_HEARTRATE_INTRADAY:
SRC_FOLDER: "rapids" # inside src/features/fitbit_heartrate_intraday
SRC_LANGUAGE: "python"
FITBIT_SLEEP_SUMMARY:
TABLE: fitbit_data
SLEEP_EPISODE_TIMESTAMP: end # summary sleep episodes are considered as events based on either the start timestamp or end timestamp.
PROVIDERS:
RAPIDS:
COMPUTE: True
FEATURES: ["countepisode", "avgefficiency", "sumdurationafterwakeup", "sumdurationasleep", "sumdurationawake", "sumdurationtofallasleep", "sumdurationinbed", "avgdurationafterwakeup", "avgdurationasleep", "avgdurationawake", "avgdurationtofallasleep", "avgdurationinbed"]
SLEEP_TYPES: ["main", "nap", "all"]
SRC_FOLDER: "rapids" # inside src/features/fitbit_sleep_summary
SRC_LANGUAGE: "python"
FITBIT_STEPS_SUMMARY:
TABLE: fitbit_data
PROVIDERS:
@ -299,19 +314,11 @@ FITBIT_STEPS_INTRADAY:
SRC_FOLDER: "rapids" # inside src/features/fitbit_steps_intraday
SRC_LANGUAGE: "python"
FITBIT_SLEEP_SUMMARY:
TABLE: fitbit_data
SLEEP_EPISODE_TIMESTAMP: end # summary sleep episodes are considered as events based on either the start timestamp or end timestamp.
PROVIDERS:
RAPIDS:
COMPUTE: True
FEATURES: ["countepisode", "avgefficiency", "sumdurationafterwakeup", "sumdurationasleep", "sumdurationawake", "sumdurationtofallasleep", "sumdurationinbed", "avgdurationafterwakeup", "avgdurationasleep", "avgdurationawake", "avgdurationtofallasleep", "avgdurationinbed"]
SLEEP_TYPES: ["main", "nap", "all"]
SRC_FOLDER: "rapids" # inside src/features/fitbit_sleep_summary
SRC_LANGUAGE: "python"
### Analysis Workflow Example ##################################################
################################################################################
########################################################################################################################
# Analysis Workflow Example #
########################################################################################################################
PARAMS_FOR_ANALYSIS:
CATEGORICAL_OPERATORS: [mostcommon]

View File

@ -79,15 +79,6 @@ rule clean_sensor_features_for_all_participants:
script:
"../src/models/workflow_example/clean_sensor_features.R"
rule merge_features_and_targets_for_individual_model:
input:
cleaned_sensor_features = "data/processed/features/{pid}/all_sensor_features_cleaned.csv",
@ -133,7 +124,7 @@ rule baselines_for_population_model:
script:
"../src/models/workflow_example/baselines.py"
rule modeling_for_individual_participants:
rule modelling_for_individual_participants:
input:
data = "data/processed/models/individual_model/{pid}/input.csv"
params:
@ -151,9 +142,9 @@ rule modeling_for_individual_participants:
log:
"data/processed/models/individual_model/{pid}/output_{cv_method}/{model}/{scaler}/notes.log"
script:
"../src/models/workflow_example/modeling.py"
"../src/models/workflow_example/modelling.py"
rule modeling_for_all_participants:
rule modelling_for_all_participants:
input:
data = "data/processed/models/population_model/input.csv"
params:
@ -171,4 +162,4 @@ rule modeling_for_all_participants:
log:
"data/processed/models/population_model/output_{cv_method}/{model}/{scaler}/notes.log"
script:
"../src/models/workflow_example/modeling.py"
"../src/models/workflow_example/modelling.py"

View File

@ -1,7 +1,7 @@
import numpy as np
import pandas as pd
from statistics import mean
from modeling_utils import getMetrics, createPipeline
from modelling_utils import getMetrics, createPipeline
from sklearn.model_selection import LeaveOneOut

View File

@ -1,6 +1,6 @@
import pandas as pd
import numpy as np
from modeling_utils import getMatchingColNames, getNormAllParticipantsScaler, getMetrics, getFeatureImportances, createPipeline
from modelling_utils import getMatchingColNames, getNormAllParticipantsScaler, getMetrics, getFeatureImportances, createPipeline
from sklearn.model_selection import LeaveOneOut, GridSearchCV