Merge branch 'master' of https://github.com/carissalow/rapids into docker

pull/95/head
Agam 2020-08-28 15:18:42 -04:00
commit 29eb56155e
13 changed files with 86 additions and 53 deletions

View File

@@ -199,8 +199,9 @@ CONVERSATION:
     IOS: plugin_studentlife_audio
     DAY_SEGMENTS: *day_segments
     FEATURES: ["minutessilence", "minutesnoise", "minutesvoice", "minutesunknown","sumconversationduration","avgconversationduration",
-        "sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","sumenergy",
-        "avgenergy","sdenergy","minenergy","maxenergy","silencesensedfraction","noisesensedfraction",
+        "sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","noisesumenergy",
+        "noiseavgenergy","noisesdenergy","noiseminenergy","noisemaxenergy","voicesumenergy",
+        "voiceavgenergy","voicesdenergy","voiceminenergy","voicemaxenergy","silencesensedfraction","noisesensedfraction",
         "voicesensedfraction","unknownsensedfraction","silenceexpectedfraction","noiseexpectedfraction","voiceexpectedfraction",
         "unknownexpectedfraction","countconversation"]
     RECORDINGMINUTES: 1

View File

@@ -842,11 +842,16 @@ avgconversationduration minutes Average duration of all conversa
 sdconversationduration minutes Standard Deviation of the duration of all conversations
 timefirstconversation minutes Minutes since midnight when the first conversation for a day segment was detected
 timelastconversation minutes Minutes since midnight when the last conversation for a day segment was detected
-sumenergy L2-norm Sum of all energy values
-avgenergy L2-norm Average of all energy values
-sdenergy L2-norm Standard Deviation of all energy values
-minenergy L2-norm Minimum of all energy values
-maxenergy L2-norm Maximum of all energy values
+noisesumenergy L2-norm Sum of all energy values when inference is noise
+noiseavgenergy L2-norm Average of all energy values when inference is noise
+noisesdenergy L2-norm Standard Deviation of all energy values when inference is noise
+noiseminenergy L2-norm Minimum of all energy values when inference is noise
+noisemaxenergy L2-norm Maximum of all energy values when inference is noise
+voicesumenergy L2-norm Sum of all energy values when inference is voice
+voiceavgenergy L2-norm Average of all energy values when inference is voice
+voicesdenergy L2-norm Standard Deviation of all energy values when inference is voice
+voiceminenergy L2-norm Minimum of all energy values when inference is voice
+voicemaxenergy L2-norm Maximum of all energy values when inference is voice
 silencesensedfraction Ratio between minutessilence and the sum of (minutessilence, minutesnoise, minutesvoice, minutesunknown)
 noisesensedfraction Ratio between minutesnoise and the sum of (minutessilence, minutesnoise, minutesvoice, minutesunknown)
 voicesensedfraction Ratio between minutesvoice and the sum of (minutessilence, minutesnoise, minutesvoice, minutesunknown)
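The sensedfraction rows in the table above all describe the same ratio over the four sensed-minute counts. A minimal sketch with toy minute counts (the numbers are illustrative, not from the dataset):

```python
# toy sensed-minute counts for one day segment (hypothetical values)
minutessilence, minutesnoise, minutesvoice, minutesunknown = 30, 10, 15, 5

# silencesensedfraction is minutessilence over the sum of all four counts
total_sensed = minutessilence + minutesnoise + minutesvoice + minutesunknown
silencesensedfraction = minutessilence / total_sensed
print(silencesensedfraction)  # 0.5
```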

View File

@@ -8,6 +8,8 @@ RAPIDS
 **R**\ eproducible **A**\ nalysis **Pi**\ pline for **D**\ ata **S**\ treams

+Do you want to keep up to date with new functionality or have a question? Join the #rapids channel in AWARE Framework's slack_
+
 Contents:

 .. toctree::
@@ -37,3 +39,5 @@ Contents:
    develop/contributors
    develop/testing
    develop/test_cases
+
+.. _slack: http://awareframework.com:3000/

View File

@@ -15,12 +15,12 @@ This is a quick guide for creating and running a simple pipeline to analysis an
 - If you are trying to connect to a local MySQL server from our docker container set your host according to this link_.
 - You can name your database any way you want, for example ``rapids_example``

-.. code-block::
+.. code-block:: bash

    [MY_GROUP]
    user=rapids
    password=rapids
-   host=127.0.0.1 # or use host.docker.internal from our docker container
+   host=127.0.0.1
    port=3306
    database=rapids_example
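The pipeline reads this INI-style credentials file with Python's configparser and assembles mysql command strings from its fields. A self-contained sketch of that flow (the ``MY_GROUP`` group name and field values mirror the example above):

```python
import configparser

# the same INI-style credentials block shown above, inlined for the sketch
creds = """
[MY_GROUP]
user=rapids
password=rapids
host=127.0.0.1
port=3306
database=rapids_example
"""

config = configparser.ConfigParser()
config.read_string(creds)

group = "MY_GROUP"
# assemble the kind of mysql invocation the pipeline builds from these fields
checkdb_cmd = ("mysql -h " + config[group]["host"] +
               " -u " + config[group]["user"] +
               " -p" + config[group]["password"] +
               " -e use " + config[group]["database"])
print(checkdb_cmd)
```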

View File

@@ -46,10 +46,11 @@ macOS (tested on Catalina 10.15)
 - ``brew install mysql``
 - ``brew services start mysql``

-#. Install R 4.0 and pandoc. If you have other instances of R, we recommend uninstalling them.
+#. Install R 4.0, pandoc and rmarkdown. If you have other instances of R, we recommend uninstalling them.

 - ``brew install r``
 - ``brew install pandoc``
+- ``Rscript --vanilla -e 'install.packages("rmarkdown", repos="http://cran.us.r-project.org")'``

 #. Install miniconda:
@@ -102,9 +103,10 @@ Linux (tested on Ubuntu 18.04 & 20.04)
 - ``sudo apt update``
 - ``sudo apt install r-base``

-#. Install Pandoc
+#. Install Pandoc and rmarkdown

 - ``sudo apt install pandoc``
+- ``Rscript --vanilla -e 'install.packages("rmarkdown", repos="http://cran.us.r-project.org")'``

 #. Install GIT

View File

@@ -7,6 +7,8 @@ At the moment, mobile data can be collected using different sensing frameworks (
 We recommend reading Snakemake_ docs, but the main idea behind the pipeline is that every link in the analysis chain is a rule with an input and an output. Input and output are files, which can be manipulated using any programming language (although Snakemake_ has wrappers for Julia_, Python_, and R_ that can make development slightly more comfortable). Snakemake_ also allows the pipeline rules to be executed in parallel on multiple cores without any code changes. This can drastically reduce the time needed to complete an analysis.

+Do you want to keep up to date with new functionality or have a question? Join the #rapids channel in AWARE Framework's slack_
+
 Available features:

 - :ref:`accelerometer-sensor-doc`
@@ -39,3 +41,4 @@ We are updating these docs constantly, but if you think something needs clarific
 .. _Fitbit: https://www.fitbit.com/us/home
 .. _Python: https://www.python.org/
 .. _Julia: https://julialang.org/
+.. _slack: http://awareframework.com:3000/

View File

@@ -1,9 +1,10 @@
 name: rapids202007
 channels:
+  - conda-forge
   - anaconda
-  - conda-forge
   - defaults
 dependencies:
+  - _py-xgboost-mutex=2.0
   - appdirs=1.4.3
   - arrow=0.15.2
   - asn1crypto=1.2.0
@@ -12,7 +13,7 @@ dependencies:
   - binaryornot=0.4.4
   - blas=1.0
   - bzip2=1.0.8
-  - ca-certificates=2020.6.24
+  - ca-certificates=2020.6.20
   - certifi=2020.6.20
   - cffi=1.13.1
   - chardet=3.0.4
@@ -25,16 +26,22 @@ dependencies:
   - gitdb2=2.0.6
   - gitpython=3.0.4
   - idna=2.8
+  - imbalanced-learn=0.6.2
   - importlib_metadata=0.23
   - intel-openmp=2019.4
   - jinja2=2.10.3
   - jinja2-time=0.2.0
   - joblib=0.16.0
   - jsonschema=3.1.1
+  - libblas=3.8.0
+  - libcblas=3.8.0
   - libcxx=9.0.0
   - libedit=3.1.20181209
   - libffi=3.2.1
   - libgfortran
+  - liblapack=3.8.0
+  - libxgboost=0.90
+  - lightgbm=2.3.0
   - llvm-openmp=10.0.0
   - markupsafe=1.1.1
   - mkl=2019.4
@@ -52,11 +59,13 @@ dependencies:
   - plotly=4.2.1
   - poyo=0.5.0
   - psutil=5.6.3
+  - py-xgboost=0.90
   - pycparser=2.19
   - pyopenssl=19.0.0
   - pysocks=1.7.1
   - python=3.7.3
   - python-dateutil=2.8.0
+  - python_abi=3.7
   - pytz=2019.3
   - pyyaml=5.1.2
   - readline=8.0
@@ -73,6 +82,7 @@ dependencies:
 - wheel=0.33.6
 - whichcraft=0.6.1
 - wrapt=1.11.2
+- xgboost=0.90
 - xz=5.2.4
 - yaml=0.1.7
 - zipp=0.6.0
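The dependencies added above are conda pins of the form ``name=version``. For illustration, a tiny sketch of splitting such a pin (the ``split_pin`` helper is hypothetical, not part of RAPIDS):

```python
def split_pin(pin):
    """Split a conda pin line like '- xgboost=0.90' into (name, version)."""
    # strip the leading list dash/spaces, then split on the first '='
    name, _, version = pin.lstrip("- ").partition("=")
    return name, version

print(split_pin("- xgboost=0.90"))    # ('xgboost', '0.90')
print(split_pin("- lightgbm=2.3.0"))  # ('lightgbm', '2.3.0')
```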

View File

@@ -311,9 +311,3 @@ PARAMS_FOR_ANALYSIS:
       {"clf__learning_rate": [0.01, 0.1, 1], "clf__n_estimators": [5, 10, 100, 200], "clf__num_leaves": [5, 16, 31, 62]}
     LightGBM:
       {"clf__learning_rate": [0.01, 0.1, 1], "clf__n_estimators": [5, 10, 100, 200], "clf__num_leaves": [5, 16, 31, 62]}
-
-  # Target Settings:
-  # 1 => TARGETS_RATIO_THRESHOLD (ceiling) or more of available CESD scores were TARGETS_VALUE_THRESHOLD or higher; 0 => otherwise
-  TARGETS_RATIO_THRESHOLD: 0.5
-  TARGETS_VALUE_THRESHOLD: 16

View File

@@ -17,9 +17,7 @@ rule targets:
         participant_info = "data/raw/{pid}/" + config["PARAMS_FOR_ANALYSIS"]["TARGET_TABLE"] + "_raw.csv"
     params:
         pid = "{pid}",
-        summarised = "{summarised}",
-        targets_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["TARGETS_RATIO_THRESHOLD"],
-        targets_value_threshold = config["PARAMS_FOR_ANALYSIS"]["TARGETS_VALUE_THRESHOLD"]
+        summarised = "{summarised}"
    output:
        "data/processed/{pid}/targets_{summarised}.csv"
    script:

View File

@@ -1,5 +1,6 @@
 import pandas as pd
 import configparser
+import subprocess
 import os

 # read database credentials
@@ -8,14 +9,20 @@ config = configparser.ConfigParser()
 config.read(snakemake.input["db_credentials"])

 # bash command to create table and restore tables from sql file
-checkdb_cmd = "mysql -h " + config[group]["host"] + " -u " + config[group]["user"] + " -p" + config[group]["password"] + " -e \"use " + config[group]["database"] + "\""
+checkdb_cmd = "mysql -h " + config[group]["host"] + " -u " + config[group]["user"] + " -p" + config[group]["password"] + " -e use " + config[group]["database"]
 create_cmd = "mysql -h " + config[group]["host"] + " -u " + config[group]["user"] + " -p" + config[group]["password"] + " -e \"CREATE DATABASE IF NOT EXISTS " + config[group]["database"] + ";\""
-restore_cmd = "mysql -h " + config[group]["host"] + " -u " + config[group]["user"] + " -p" + config[group]["password"] + " " + config[group]["database"] + " < data/external/" + config[group]["database"] + ".sql"
+restore_cmd = "mysql -h " + config[group]["host"] + " -u " + config[group]["user"] + " -p" + config[group]["password"] + " " + config[group]["database"] + " < data/external/rapids_example.sql"

 try:
-    os.system(checkdb_cmd)
-except:
-    print(config[group]["database"] + " DB already exists.")
-else:
+    print("Checking if " + config[group]["database"] + " database exists")
+    subprocess.run(checkdb_cmd.split(), check = True, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
+except subprocess.CalledProcessError:
+    print(config[group]["database"] + " database does not exist")
+    print("Creating " + config[group]["database"] + " database")
     os.system(create_cmd)
+    print(config[group]["database"] + " database created")
+    print("Restoring rapids_example.sql")
     os.system(restore_cmd)
+    print("rapids_example.sql restored in " + config[group]["database"] + " database")
+else:
+    raise ValueError(config[group]["database"] + " DB already exists")
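The move from ``os.system`` to ``subprocess.run`` matters because ``check=True`` turns a non-zero mysql exit status into a ``CalledProcessError`` the script can branch on, which bare ``os.system`` never does. A minimal sketch of the pattern, using the Python interpreter itself to simulate the two exit statuses (the ``database_exists`` helper is illustrative, not RAPIDS code):

```python
import subprocess
import sys

def database_exists(check_cmd):
    """Return True if the check command exits 0, False otherwise."""
    try:
        # check=True raises CalledProcessError on a non-zero exit status
        subprocess.run(check_cmd, check=True,
                       stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError:
        return False
    return True

# simulate the two outcomes: exit 0 (database found) vs exit 1 (not found)
ok = database_exists([sys.executable, "-c", "raise SystemExit(0)"])
missing = database_exists([sys.executable, "-c", "raise SystemExit(1)"])
print(ok, missing)  # True False
```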

View File

@@ -3,8 +3,9 @@ import pandas as pd
 def base_conversation_features(conversation_data, day_segment, requested_features,recordingMinutes,pausedMinutes,expectedMinutes):
     # name of the features this function can compute
     base_features_names = ["minutessilence", "minutesnoise", "minutesvoice", "minutesunknown","sumconversationduration","avgconversationduration",
-        "sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","sumenergy",
-        "avgenergy","sdenergy","minenergy","maxenergy","silencesensedfraction","noisesensedfraction",
+        "sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","noisesumenergy",
+        "noiseavgenergy","noisesdenergy","noiseminenergy","noisemaxenergy","voicesumenergy",
+        "voiceavgenergy","voicesdenergy","voiceminenergy","voicemaxenergy","silencesensedfraction","noisesensedfraction",
         "voicesensedfraction","unknownsensedfraction","silenceexpectedfraction","noiseexpectedfraction","voiceexpectedfraction",
         "unknownexpectedfraction","countconversation"]
@@ -96,21 +97,35 @@ def base_conversation_features(conversation_data, day_segment, requested_feature
         else:
             conversation_features["conversation_" + day_segment + "_timelastconversation"] = 0

-        if "sumenergy" in features_to_compute:
-            conversation_features["conversation_" + day_segment + "_sumenergy"] = conversation_data.groupby(["local_date"])["double_energy"].sum()
-        if "avgenergy" in features_to_compute:
-            conversation_features["conversation_" + day_segment + "_avgenergy"] = conversation_data.groupby(["local_date"])["double_energy"].mean()
-        if "sdenergy" in features_to_compute:
-            conversation_features["conversation_" + day_segment + "_sdenergy"] = conversation_data.groupby(["local_date"])["double_energy"].std()
-        if "minenergy" in features_to_compute:
-            conversation_features["conversation_" + day_segment + "_minenergy"] = conversation_data.groupby(["local_date"])["double_energy"].min()
-        if "maxenergy" in features_to_compute:
-            conversation_features["conversation_" + day_segment + "_maxenergy"] = conversation_data.groupby(["local_date"])["double_energy"].max()
+        if "noisesumenergy" in features_to_compute:
+            conversation_features["conversation_" + day_segment + "_noisesumenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_date"])["double_energy"].sum()
+        if "noiseavgenergy" in features_to_compute:
+            conversation_features["conversation_" + day_segment + "_noiseavgenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_date"])["double_energy"].mean()
+        if "noisesdenergy" in features_to_compute:
+            conversation_features["conversation_" + day_segment + "_noisesdenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_date"])["double_energy"].std()
+        if "noiseminenergy" in features_to_compute:
+            conversation_features["conversation_" + day_segment + "_noiseminenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_date"])["double_energy"].min()
+        if "noisemaxenergy" in features_to_compute:
+            conversation_features["conversation_" + day_segment + "_noisemaxenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_date"])["double_energy"].max()
+        if "voicesumenergy" in features_to_compute:
+            conversation_features["conversation_" + day_segment + "_voicesumenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_date"])["double_energy"].sum()
+        if "voiceavgenergy" in features_to_compute:
+            conversation_features["conversation_" + day_segment + "_voiceavgenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_date"])["double_energy"].mean()
+        if "voicesdenergy" in features_to_compute:
+            conversation_features["conversation_" + day_segment + "_voicesdenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_date"])["double_energy"].std()
+        if "voiceminenergy" in features_to_compute:
+            conversation_features["conversation_" + day_segment + "_voiceminenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_date"])["double_energy"].min()
+        if "voicemaxenergy" in features_to_compute:
+            conversation_features["conversation_" + day_segment + "_voicemaxenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_date"])["double_energy"].max()

         conversation_features = conversation_features.reset_index()
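Every new energy feature above follows one pattern: filter rows by the inference label (1 = noise, 2 = voice, as used in the diff), then aggregate ``double_energy`` per ``local_date``. A pandas sketch with toy data:

```python
import pandas as pd

# toy conversation data; inference 1 = noise, 2 = voice (values are illustrative)
conversation_data = pd.DataFrame({
    "local_date": ["2020-08-28", "2020-08-28", "2020-08-28", "2020-08-29"],
    "inference": [1, 2, 1, 2],
    "double_energy": [10.0, 4.0, 6.0, 8.0],
})

# noisesumenergy: sum of energy values per day when inference is noise
noisesumenergy = (conversation_data[conversation_data["inference"] == 1]
                  .groupby("local_date")["double_energy"].sum())

# voicemaxenergy: maximum energy value per day when inference is voice
voicemaxenergy = (conversation_data[conversation_data["inference"] == 2]
                  .groupby("local_date")["double_energy"].max())
```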

View File

@@ -49,7 +49,10 @@ def getMetrics(pred_y, pred_y_prob, true_y):
     metrics = {}
     # metrics for all categories
     metrics["accuracy"] = accuracy_score(true_y, pred_y)
-    metrics["auc"] = roc_auc_score(true_y, pred_y_prob)
+    try:
+        metrics["auc"] = roc_auc_score(true_y, pred_y_prob)
+    except:
+        metrics["auc"] = None
     metrics["kappa"] = cohen_kappa_score(true_y, pred_y)
     # metrics for label 0
     metrics["precision0"] = precision_score(true_y, pred_y, average=None, labels=[0,1], zero_division=0)[0]
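The guard added above exists because ``roc_auc_score`` raises ``ValueError`` when ``true_y`` contains only one class (e.g. a fold where every row has target 0), so AUC is undefined. A self-contained sketch of the same guard, with a hypothetical pure-Python helper standing in for sklearn:

```python
def safe_auc(true_y, pred_y_prob):
    """Return None when AUC is undefined (single-class true_y), mimicking
    the diff's try/except; safe_auc itself is an illustrative helper."""
    if len(set(true_y)) < 2:
        return None  # roc_auc_score would raise ValueError here
    # rank-based AUC: probability a positive outranks a negative (ties count half)
    pos = [p for t, p in zip(true_y, pred_y_prob) if t == 1]
    neg = [p for t, p in zip(true_y, pred_y_prob) if t == 0]
    wins = sum((p > n) + 0.5 * (p == n) for p in pos for n in neg)
    return wins / (len(pos) * len(neg))

print(safe_auc([0, 0, 0], [0.1, 0.2, 0.3]))          # None
print(safe_auc([0, 1, 0, 1], [0.1, 0.9, 0.4, 0.6]))  # 1.0
```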

View File

@@ -3,19 +3,10 @@ import numpy as np
 pid = snakemake.params["pid"]
 summarised = snakemake.params["summarised"]
-targets_ratio_threshold = snakemake.params["targets_ratio_threshold"]
-targets_value_threshold = snakemake.params["targets_value_threshold"]
 participant_info = pd.read_csv(snakemake.input["participant_info"])

 if summarised == "summarised":
-    targets = pd.DataFrame(columns=["pid", "target"])
-    if not participant_info.empty:
-        cesds = participant_info.loc[0, ["preop_cesd_total", "inpatient_cesd_total", "postop_cesd_total", "3month_cesd_total"]]
-        # targets: 1 => 50% (ceiling) or more of available CESD scores were 16 or higher; 0 => otherwise
-        num_threshold = int((cesds.count() + 1) * targets_ratio_threshold)
-        target = 1 if cesds.apply(lambda x : 1 if x >= targets_value_threshold else 0).sum() >= num_threshold else 0
-        targets.loc[0, :] = [pid, target]
+    raise ValueError("Do not support summarised features for example dataset.")
 elif summarised == "notsummarised":
     targets = participant_info[["local_date", "target"]]