diff --git a/config.yaml b/config.yaml index c40a2e02..a0cacb3d 100644 --- a/config.yaml +++ b/config.yaml @@ -199,8 +199,9 @@ CONVERSATION: IOS: plugin_studentlife_audio DAY_SEGMENTS: *day_segments FEATURES: ["minutessilence", "minutesnoise", "minutesvoice", "minutesunknown","sumconversationduration","avgconversationduration", - "sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","sumenergy", - "avgenergy","sdenergy","minenergy","maxenergy","silencesensedfraction","noisesensedfraction", + "sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","noisesumenergy", + "noiseavgenergy","noisesdenergy","noiseminenergy","noisemaxenergy","voicesumenergy", + "voiceavgenergy","voicesdenergy","voiceminenergy","voicemaxenergy","silencesensedfraction","noisesensedfraction", "voicesensedfraction","unknownsensedfraction","silenceexpectedfraction","noiseexpectedfraction","voiceexpectedfraction", "unknownexpectedfraction","countconversation"] RECORDINGMINUTES: 1 diff --git a/docs/features/extracted.rst b/docs/features/extracted.rst index 14f50b59..a99392f5 100644 --- a/docs/features/extracted.rst +++ b/docs/features/extracted.rst @@ -842,11 +842,16 @@ avgconversationduration minutes Average duration of all conversa sdconversationduration minutes Standard Deviation of the duration of all conversations timefirstconversation minutes Minutes since midnight when the first conversation for a day segment was detected timelastconversation minutes Minutes since midnight when the last conversation for a day segment was detected -sumenergy L2-norm Sum of all energy values -avgenergy L2-norm Average of all energy values -sdenergy L2-norm Standard Deviation of all energy values -minenergy L2-norm Minimum of all energy values -maxenergy L2-norm Maximum of all energy values +noisesumenergy L2-norm Sum of all energy values when inference is noise +noiseavgenergy L2-norm Average of all energy values when inference is noise +noisesdenergy L2-norm Standard Deviation of all energy values when inference is noise +noiseminenergy L2-norm Minimum of all energy values when inference is noise +noisemaxenergy L2-norm Maximum of all energy values when inference is noise +voicesumenergy L2-norm Sum of all energy values when inference is voice +voiceavgenergy L2-norm Average of all energy values when inference is voice +voicesdenergy L2-norm Standard Deviation of all energy values when inference is voice +voiceminenergy L2-norm Minimum of all energy values when inference is voice +voicemaxenergy L2-norm Maximum of all energy values when inference is voice silencesensedfraction Ratio between minutessilence and the sum of (minutessilence, minutesnoise, minutesvoice, minutesunknown) noisesensedfraction Ratio between minutesnoise and the sum of (minutessilence, minutesnoise, minutesvoice, minutesunknown) voicesensedfraction Ratio between minutesvoice and the sum of (minutessilence, minutesnoise, minutesvoice, minutesunknown) diff --git a/docs/index.rst b/docs/index.rst index d3805bc9..e3040bed 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -8,6 +8,8 @@ RAPIDS **R**\ eproducible **A**\ nalysis **Pi**\ pline for **D**\ ata **S**\ treams +Do you want to keep up to date with new functionality or have a question? Join the #rapids channel in AWARE Framework's slack_ + Contents: .. toctree:: @@ -36,4 +38,6 @@ Contents: develop/environments develop/contributors develop/testing - develop/test_cases \ No newline at end of file + develop/test_cases + +.. _slack: http://awareframework.com:3000/ diff --git a/docs/usage/example.rst b/docs/usage/example.rst index 5a5149c4..73cf9413 100644 --- a/docs/usage/example.rst +++ b/docs/usage/example.rst @@ -15,12 +15,12 @@ This is a quick guide for creating and running a simple pipeline to analysis an - If you are trying to connect to a local MySQL server from our docker container set your host according to this link_. - You can name your database any way you want, for example ``rapids_example`` - .. code-block:: + .. code-block:: bash [MY_GROUP] user=rapids password=rapids - host=127.0.0.1 # or use host.docker.internal from our docker container + host=127.0.0.1 port=3306 database=rapids_example diff --git a/docs/usage/installation.rst b/docs/usage/installation.rst index 3f8fd01f..ff51b3c4 100644 --- a/docs/usage/installation.rst +++ b/docs/usage/installation.rst @@ -46,10 +46,11 @@ macOS (tested on Catalina 10.15) - ``brew install mysql`` - ``brew services start mysql`` -#. Install R 4.0 and pandoc. If you have other instances of R, we recommend uninstalling them. +#. Install R 4.0, pandoc and rmarkdown. If you have other instances of R, we recommend uninstalling them. - ``brew install r`` - ``brew install pandoc`` + - ``Rscript --vanilla -e 'install.packages("rmarkdown", repos="http://cran.us.r-project.org")'`` #. Install miniconda: @@ -102,9 +103,10 @@ Linux (tested on Ubuntu 18.04 & 20.04) - ``sudo apt update`` - ``sudo apt install r-base`` -#. Install Pandoc +#. Install Pandoc and rmarkdown - ``sudo apt install pandoc`` + - ``Rscript --vanilla -e 'install.packages("rmarkdown", repos="http://cran.us.r-project.org")'`` #. Install GIT diff --git a/docs/usage/introduction.rst b/docs/usage/introduction.rst index bfe98b55..b14d8743 100644 --- a/docs/usage/introduction.rst +++ b/docs/usage/introduction.rst @@ -7,6 +7,8 @@ At the moment, mobile data can be collected using different sensing frameworks ( We recommend reading Snakemake_ docs, but the main idea behind the pipeline is that every link in the analysis chain is a rule with an input and an output. Input and output are files, which can be manipulated using any programming language (although Snakemake_ has wrappers for Julia_, Python_, and R_ that can make development slightly more comfortable). Snakemake_ also allows the pipeline rules to be executed in parallel on multiple cores without any code changes. This can drastically reduce the time needed to complete an analysis. +Do you want to keep up to date with new functionality or have a question? Join the #rapids channel in AWARE Framework's slack_ + Available features: - :ref:`accelerometer-sensor-doc` @@ -39,3 +41,4 @@ We are updating these docs constantly, but if you think something needs clarific .. _Fitbit: https://www.fitbit.com/us/home .. _Python: https://www.python.org/ .. _Julia: https://julialang.org/ +.. _slack: http://awareframework.com:3000/ diff --git a/environment.yml b/environment.yml index 35ae2223..38a15b75 100644 --- a/environment.yml +++ b/environment.yml @@ -1,9 +1,10 @@ name: rapids202007 channels: - - conda-forge - anaconda + - conda-forge - defaults dependencies: + - _py-xgboost-mutex=2.0 - appdirs=1.4.3 - arrow=0.15.2 - asn1crypto=1.2.0 @@ -12,7 +13,7 @@ dependencies: - binaryornot=0.4.4 - blas=1.0 - bzip2=1.0.8 - - ca-certificates=2020.6.24 + - ca-certificates=2020.6.20 - certifi=2020.6.20 - cffi=1.13.1 - chardet=3.0.4 @@ -25,16 +26,22 @@ dependencies: - gitdb2=2.0.6 - gitpython=3.0.4 - idna=2.8 + - imbalanced-learn=0.6.2 - importlib_metadata=0.23 - intel-openmp=2019.4 - jinja2=2.10.3 - jinja2-time=0.2.0 - joblib=0.16.0 - jsonschema=3.1.1 + - libblas=3.8.0 + - libcblas=3.8.0 - libcxx=9.0.0 - libedit=3.1.20181209 - libffi=3.2.1 - libgfortran + - liblapack=3.8.0 + - libxgboost=0.90 + - lightgbm=2.3.0 - llvm-openmp=10.0.0 - markupsafe=1.1.1 - mkl=2019.4 @@ -52,11 +59,13 @@ dependencies: - plotly=4.2.1 - poyo=0.5.0 - psutil=5.6.3 + - py-xgboost=0.90 - pycparser=2.19 - pyopenssl=19.0.0 - pysocks=1.7.1 - python=3.7.3 - python-dateutil=2.8.0 + - python_abi=3.7 - pytz=2019.3 - pyyaml=5.1.2 - readline=8.0 @@ -73,6 +82,7 @@ dependencies: - wheel=0.33.6 - whichcraft=0.6.1 - wrapt=1.11.2 + - xgboost=0.90 - xz=5.2.4 - yaml=0.1.7 - zipp=0.6.0 diff --git a/example_profile/example_config.yaml b/example_profile/example_config.yaml index 8a57056d..ff59e3cb 100644 --- a/example_profile/example_config.yaml +++ b/example_profile/example_config.yaml @@ -311,9 +311,3 @@ PARAMS_FOR_ANALYSIS: {"clf__learning_rate": [0.01, 0.1, 1], "clf__n_estimators": [5, 10, 100, 200], "clf__num_leaves": [5, 16, 31, 62]} LightGBM: {"clf__learning_rate": [0.01, 0.1, 1], "clf__n_estimators": [5, 10, 100, 200], "clf__num_leaves": [5, 16, 31, 62]} - - - # Target Settings: - # 1 => TARGETS_RATIO_THRESHOLD (ceiling) or more of available CESD scores were TARGETS_VALUE_THRESHOLD or higher; 0 => otherwise - TARGETS_RATIO_THRESHOLD: 0.5 - TARGETS_VALUE_THRESHOLD: 16 diff --git a/rules/models.smk b/rules/models.smk index 287fdf35..ce6fcba6 100644 --- a/rules/models.smk +++ b/rules/models.smk @@ -17,9 +17,7 @@ rule targets: participant_info = "data/raw/{pid}/" + config["PARAMS_FOR_ANALYSIS"]["TARGET_TABLE"] + "_raw.csv" params: pid = "{pid}", - summarised = "{summarised}", - targets_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["TARGETS_RATIO_THRESHOLD"], - targets_value_threshold = config["PARAMS_FOR_ANALYSIS"]["TARGETS_VALUE_THRESHOLD"] + summarised = "{summarised}" output: "data/processed/{pid}/targets_{summarised}.csv" script: diff --git a/src/data/restore_sql_file.py b/src/data/restore_sql_file.py index dfa14011..d6b26e9b 100644 --- a/src/data/restore_sql_file.py +++ b/src/data/restore_sql_file.py @@ -1,5 +1,6 @@ import pandas as pd import configparser +import subprocess import os # read database credentials @@ -8,14 +9,20 @@ config = configparser.ConfigParser() config.read(snakemake.input["db_credentials"]) # bash command to create table and restore tables from sql file -checkdb_cmd = "mysql -h " + config[group]["host"] + " -u " + config[group]["user"] + " -p" + config[group]["password"] + " -e \"use " + config[group]["database"] + "\"" +checkdb_cmd = "mysql -h " + config[group]["host"] + " -u " + config[group]["user"] + " -p" + config[group]["password"] + " -e use " + config[group]["database"] create_cmd = "mysql -h " + config[group]["host"] + " -u " + config[group]["user"] + " -p" + config[group]["password"] + " -e \"CREATE DATABASE IF NOT EXISTS " + config[group]["database"] + ";\"" -restore_cmd = "mysql -h " + config[group]["host"] + " -u " + config[group]["user"] + " -p" + config[group]["password"] + " " + config[group]["database"] + " < data/external/" + config[group]["database"] + ".sql" +restore_cmd = "mysql -h " + config[group]["host"] + " -u " + config[group]["user"] + " -p" + config[group]["password"] + " " + config[group]["database"] + " < data/external/rapids_example.sql" try: - os.system(checkdb_cmd) -except: - print(config[group]["database"] + " DB already exists.") -else: + print("Checking if " + config[group]["database"] + " database exists") + subprocess.run(checkdb_cmd.split(), check = True, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT) +except subprocess.CalledProcessError: + print(config[group]["database"] + " database does not exist") + print("Creating " + config[group]["database"] + " database") os.system(create_cmd) + print(config[group]["database"] + " database created") + print("Restoring rapids_example.sql") os.system(restore_cmd) + print("rapids_example.sql restored in " + config[group]["database"] + " database") +else: + raise ValueError(config[group]["database"] + " DB already exists") diff --git a/src/features/conversation/conversation_base.py b/src/features/conversation/conversation_base.py index eb00c64b..6c50919a 100644 --- a/src/features/conversation/conversation_base.py +++ b/src/features/conversation/conversation_base.py @@ -3,8 +3,9 @@ import pandas as pd def base_conversation_features(conversation_data, day_segment, requested_features,recordingMinutes,pausedMinutes,expectedMinutes): # name of the features this function can compute base_features_names = ["minutessilence", "minutesnoise", "minutesvoice", "minutesunknown","sumconversationduration","avgconversationduration", - "sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","sumenergy", - "avgenergy","sdenergy","minenergy","maxenergy","silencesensedfraction","noisesensedfraction", + "sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","noisesumenergy", + "noiseavgenergy","noisesdenergy","noiseminenergy","noisemaxenergy","voicesumenergy", + "voiceavgenergy","voicesdenergy","voiceminenergy","voicemaxenergy","silencesensedfraction","noisesensedfraction", "voicesensedfraction","unknownsensedfraction","silenceexpectedfraction","noiseexpectedfraction","voiceexpectedfraction", "unknownexpectedfraction","countconversation"] @@ -96,22 +97,36 @@ def base_conversation_features(conversation_data, day_segment, requested_feature else: conversation_features["conversation_" + day_segment + "_timelastconversation"] = 0 - if "sumenergy" in features_to_compute: - conversation_features["conversation_" + day_segment + "_sumenergy"] = conversation_data.groupby(["local_date"])["double_energy"].sum() + if "noisesumenergy" in features_to_compute: + conversation_features["conversation_" + day_segment + "_noisesumenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_date"])["double_energy"].sum() - if "avgenergy" in features_to_compute: - conversation_features["conversation_" + day_segment + "_avgenergy"] = conversation_data.groupby(["local_date"])["double_energy"].mean() + if "noiseavgenergy" in features_to_compute: + conversation_features["conversation_" + day_segment + "_noiseavgenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_date"])["double_energy"].mean() - if "sdenergy" in features_to_compute: - conversation_features["conversation_" + day_segment + "_sdenergy"] = conversation_data.groupby(["local_date"])["double_energy"].std() + if "noisesdenergy" in features_to_compute: + conversation_features["conversation_" + day_segment + "_noisesdenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_date"])["double_energy"].std() - if "minenergy" in features_to_compute: - conversation_features["conversation_" + day_segment + "_minenergy"] = conversation_data.groupby(["local_date"])["double_energy"].min() + if "noiseminenergy" in features_to_compute: + conversation_features["conversation_" + day_segment + "_noiseminenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_date"])["double_energy"].min() - if "maxenergy" in features_to_compute: - conversation_features["conversation_" + day_segment + "_maxenergy"] = conversation_data.groupby(["local_date"])["double_energy"].max() + if "noisemaxenergy" in features_to_compute: + conversation_features["conversation_" + day_segment + "_noisemaxenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_date"])["double_energy"].max() + if "voicesumenergy" in features_to_compute: + conversation_features["conversation_" + day_segment + "_voicesumenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_date"])["double_energy"].sum() + + if "voiceavgenergy" in features_to_compute: + conversation_features["conversation_" + day_segment + "_voiceavgenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_date"])["double_energy"].mean() + + if "voicesdenergy" in features_to_compute: + conversation_features["conversation_" + day_segment + "_voicesdenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_date"])["double_energy"].std() + + if "voiceminenergy" in features_to_compute: + conversation_features["conversation_" + day_segment + "_voiceminenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_date"])["double_energy"].min() + + if "voicemaxenergy" in features_to_compute: + conversation_features["conversation_" + day_segment + "_voicemaxenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_date"])["double_energy"].max() conversation_features = conversation_features.reset_index() - + return conversation_features \ No newline at end of file diff --git a/src/models/modeling_utils.py b/src/models/modeling_utils.py index 84a66188..1ba17c50 100644 --- a/src/models/modeling_utils.py +++ b/src/models/modeling_utils.py @@ -49,7 +49,10 @@ def getMetrics(pred_y, pred_y_prob, true_y): metrics = {} # metrics for all categories metrics["accuracy"] = accuracy_score(true_y, pred_y) - metrics["auc"] = roc_auc_score(true_y, pred_y_prob) + try: + metrics["auc"] = roc_auc_score(true_y, pred_y_prob) + except: + metrics["auc"] = None metrics["kappa"] = cohen_kappa_score(true_y, pred_y) # metrics for label 0 metrics["precision0"] = precision_score(true_y, pred_y, average=None, labels=[0,1], zero_division=0)[0] diff --git a/src/models/targets.py b/src/models/targets.py index e3ebcdd8..e12794d4 100644 --- a/src/models/targets.py +++ b/src/models/targets.py @@ -3,19 +3,10 @@ import numpy as np pid = snakemake.params["pid"] summarised = snakemake.params["summarised"] -targets_ratio_threshold = snakemake.params["targets_ratio_threshold"] -targets_value_threshold = snakemake.params["targets_value_threshold"] participant_info = pd.read_csv(snakemake.input["participant_info"]) if summarised == "summarised": - targets = pd.DataFrame(columns=["pid", "target"]) - - if not participant_info.empty: - cesds = participant_info.loc[0, ["preop_cesd_total", "inpatient_cesd_total", "postop_cesd_total", "3month_cesd_total"]] - # targets: 1 => 50% (ceiling) or more of available CESD scores were 16 or higher; 0 => otherwise - num_threshold = int((cesds.count() + 1) * targets_ratio_threshold) - target = 1 if cesds.apply(lambda x : 1 if x >= targets_value_threshold else 0).sum() >= num_threshold else 0 - targets.loc[0, :] = [pid, target] + raise ValueError("Do not support summarised features for example dataset.") elif summarised == "notsummarised": targets = participant_info[["local_date", "target"]]