Merge branch 'master' of https://github.com/carissalow/rapids into docker

pull/95/head
Agam 2020-08-28 15:18:42 -04:00
commit 29eb56155e
13 changed files with 86 additions and 53 deletions

View File

@ -199,8 +199,9 @@ CONVERSATION:
IOS: plugin_studentlife_audio
DAY_SEGMENTS: *day_segments
FEATURES: ["minutessilence", "minutesnoise", "minutesvoice", "minutesunknown","sumconversationduration","avgconversationduration",
"sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","sumenergy",
"avgenergy","sdenergy","minenergy","maxenergy","silencesensedfraction","noisesensedfraction",
"sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","noisesumenergy",
"noiseavgenergy","noisesdenergy","noiseminenergy","noisemaxenergy","voicesumenergy",
"voiceavgenergy","voicesdenergy","voiceminenergy","voicemaxenergy","silencesensedfraction","noisesensedfraction",
"voicesensedfraction","unknownsensedfraction","silenceexpectedfraction","noiseexpectedfraction","voiceexpectedfraction",
"unknownexpectedfraction","countconversation"]
RECORDINGMINUTES: 1

View File

@ -842,11 +842,16 @@ avgconversationduration minutes Average duration of all conversa
sdconversationduration minutes Standard Deviation of the duration of all conversations
timefirstconversation minutes Minutes since midnight when the first conversation for a day segment was detected
timelastconversation minutes Minutes since midnight when the last conversation for a day segment was detected
sumenergy L2-norm Sum of all energy values
avgenergy L2-norm Average of all energy values
sdenergy L2-norm Standard Deviation of all energy values
minenergy L2-norm Minimum of all energy values
maxenergy L2-norm Maximum of all energy values
noisesumenergy L2-norm Sum of all energy values when inference is noise
noiseavgenergy L2-norm Average of all energy values when inference is noise
noisesdenergy L2-norm Standard Deviation of all energy values when inference is noise
noiseminenergy L2-norm Minimum of all energy values when inference is noise
noisemaxenergy L2-norm Maximum of all energy values when inference is noise
voicesumenergy L2-norm Sum of all energy values when inference is voice
voiceavgenergy L2-norm Average of all energy values when inference is voice
voicesdenergy L2-norm Standard Deviation of all energy values when inference is voice
voiceminenergy L2-norm Minimum of all energy values when inference is voice
voicemaxenergy L2-norm Maximum of all energy values when inference is voice
silencesensedfraction Ratio between minutessilence and the sum of (minutessilence, minutesnoise, minutesvoice, minutesunknown)
noisesensedfraction Ratio between minutesnoise and the sum of (minutessilence, minutesnoise, minutesvoice, minutesunknown)
voicesensedfraction Ratio between minutesvoice and the sum of (minutessilence, minutesnoise, minutesvoice, minutesunknown)
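The sensed-fraction features above are plain ratios of the per-category minute counts. A minimal stdlib-only sketch of that arithmetic, using hypothetical minute totals for one day segment (the real pipeline derives these counts from the conversation sensor data):

```python
# Hypothetical minute totals for one day segment.
minutes_silence, minutes_noise, minutes_voice, minutes_unknown = 30, 10, 15, 5

total_sensed = minutes_silence + minutes_noise + minutes_voice + minutes_unknown

# Each sensed fraction is that category's minutes over the total sensed minutes.
silence_sensed_fraction = minutes_silence / total_sensed
noise_sensed_fraction = minutes_noise / total_sensed
voice_sensed_fraction = minutes_voice / total_sensed

print(silence_sensed_fraction)  # 0.5
```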

View File

@ -8,6 +8,8 @@ RAPIDS
**R**\ eproducible **A**\ nalysis **Pi**\ peline for **D**\ ata **S**\ treams
Do you want to keep up to date with new functionality or have a question? Join the #rapids channel in AWARE Framework's slack_
Contents:
.. toctree::
@ -36,4 +38,6 @@ Contents:
develop/environments
develop/contributors
develop/testing
develop/test_cases
develop/test_cases
.. _slack: http://awareframework.com:3000/

View File

@ -15,12 +15,12 @@ This is a quick guide for creating and running a simple pipeline to analyze an
- If you are trying to connect to a local MySQL server from our docker container, set your host according to this link_.
- You can name your database any way you want, for example ``rapids_example``
.. code-block::
.. code-block:: bash
[MY_GROUP]
user=rapids
password=rapids
host=127.0.0.1 # or use host.docker.internal from our docker container
host=127.0.0.1
port=3306
database=rapids_example
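RAPIDS reads this INI-style credentials file with Python's standard ``configparser`` (the database-restore script changed later in this commit does exactly that with ``config.read(path)``). A minimal sketch, inlining the same file content as a string:

```python
import configparser

# The credentials file shown above, inlined for illustration.
credentials = """
[MY_GROUP]
user=rapids
password=rapids
host=127.0.0.1
port=3306
database=rapids_example
"""

config = configparser.ConfigParser()
config.read_string(credentials)  # the pipeline uses config.read(path) on the real file

group = "MY_GROUP"
print(config[group]["database"])  # rapids_example
```

Note that ``configparser`` returns every value as a string, so the port would need an explicit ``int(...)`` before use.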

View File

@ -46,10 +46,11 @@ macOS (tested on Catalina 10.15)
- ``brew install mysql``
- ``brew services start mysql``
#. Install R 4.0 and pandoc. If you have other instances of R, we recommend uninstalling them.
#. Install R 4.0, pandoc and rmarkdown. If you have other instances of R, we recommend uninstalling them.
- ``brew install r``
- ``brew install pandoc``
- ``Rscript --vanilla -e 'install.packages("rmarkdown", repos="http://cran.us.r-project.org")'``
#. Install miniconda:
@ -102,9 +103,10 @@ Linux (tested on Ubuntu 18.04 & 20.04)
- ``sudo apt update``
- ``sudo apt install r-base``
#. Install Pandoc
#. Install Pandoc and rmarkdown
- ``sudo apt install pandoc``
- ``Rscript --vanilla -e 'install.packages("rmarkdown", repos="http://cran.us.r-project.org")'``
#. Install GIT

View File

@ -7,6 +7,8 @@ At the moment, mobile data can be collected using different sensing frameworks (
We recommend reading Snakemake_ docs, but the main idea behind the pipeline is that every link in the analysis chain is a rule with an input and an output. Input and output are files, which can be manipulated using any programming language (although Snakemake_ has wrappers for Julia_, Python_, and R_ that can make development slightly more comfortable). Snakemake_ also allows the pipeline rules to be executed in parallel on multiple cores without any code changes. This can drastically reduce the time needed to complete an analysis.
Do you want to keep up to date with new functionality or have a question? Join the #rapids channel in AWARE Framework's slack_
Available features:
- :ref:`accelerometer-sensor-doc`
@ -39,3 +41,4 @@ We are updating these docs constantly, but if you think something needs clarific
.. _Fitbit: https://www.fitbit.com/us/home
.. _Python: https://www.python.org/
.. _Julia: https://julialang.org/
.. _slack: http://awareframework.com:3000/

View File

@ -1,9 +1,10 @@
name: rapids202007
channels:
- conda-forge
- anaconda
- conda-forge
- defaults
dependencies:
- _py-xgboost-mutex=2.0
- appdirs=1.4.3
- arrow=0.15.2
- asn1crypto=1.2.0
@ -12,7 +13,7 @@ dependencies:
- binaryornot=0.4.4
- blas=1.0
- bzip2=1.0.8
- ca-certificates=2020.6.24
- ca-certificates=2020.6.20
- certifi=2020.6.20
- cffi=1.13.1
- chardet=3.0.4
@ -25,16 +26,22 @@ dependencies:
- gitdb2=2.0.6
- gitpython=3.0.4
- idna=2.8
- imbalanced-learn=0.6.2
- importlib_metadata=0.23
- intel-openmp=2019.4
- jinja2=2.10.3
- jinja2-time=0.2.0
- joblib=0.16.0
- jsonschema=3.1.1
- libblas=3.8.0
- libcblas=3.8.0
- libcxx=9.0.0
- libedit=3.1.20181209
- libffi=3.2.1
- libgfortran
- liblapack=3.8.0
- libxgboost=0.90
- lightgbm=2.3.0
- llvm-openmp=10.0.0
- markupsafe=1.1.1
- mkl=2019.4
@ -52,11 +59,13 @@ dependencies:
- plotly=4.2.1
- poyo=0.5.0
- psutil=5.6.3
- py-xgboost=0.90
- pycparser=2.19
- pyopenssl=19.0.0
- pysocks=1.7.1
- python=3.7.3
- python-dateutil=2.8.0
- python_abi=3.7
- pytz=2019.3
- pyyaml=5.1.2
- readline=8.0
@ -73,6 +82,7 @@ dependencies:
- wheel=0.33.6
- whichcraft=0.6.1
- wrapt=1.11.2
- xgboost=0.90
- xz=5.2.4
- yaml=0.1.7
- zipp=0.6.0

View File

@ -311,9 +311,3 @@ PARAMS_FOR_ANALYSIS:
{"clf__learning_rate": [0.01, 0.1, 1], "clf__n_estimators": [5, 10, 100, 200], "clf__num_leaves": [5, 16, 31, 62]}
LightGBM:
{"clf__learning_rate": [0.01, 0.1, 1], "clf__n_estimators": [5, 10, 100, 200], "clf__num_leaves": [5, 16, 31, 62]}
# Target Settings:
# 1 => TARGETS_RATIO_THRESHOLD (ceiling) or more of available CESD scores were TARGETS_VALUE_THRESHOLD or higher; 0 => otherwise
TARGETS_RATIO_THRESHOLD: 0.5
TARGETS_VALUE_THRESHOLD: 16
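The thresholding removed from the config here works as the comment describes: the target is 1 when at least ``ceil(ratio * available scores)`` of the CESD scores reach the value threshold. A stdlib sketch with hypothetical scores (the 0.5 and 16 thresholds come from the removed config; the ``int((count + 1) * ratio)`` expression in the removed script equals this ceiling for a ratio of 0.5):

```python
import math

# Hypothetical CESD scores for one participant (None marks a missing survey).
cesd_scores = [10, 18, 20, None]
targets_ratio_threshold = 0.5   # fraction of available scores that must reach the cutoff
targets_value_threshold = 16    # CESD cutoff

available = [s for s in cesd_scores if s is not None]
# Ceiling of the ratio applied to the number of available scores.
num_threshold = math.ceil(len(available) * targets_ratio_threshold)
high = sum(1 for s in available if s >= targets_value_threshold)
target = 1 if high >= num_threshold else 0
print(target)  # 1: two of the three available scores reach 16, and ceil(1.5) = 2
```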

View File

@ -17,9 +17,7 @@ rule targets:
participant_info = "data/raw/{pid}/" + config["PARAMS_FOR_ANALYSIS"]["TARGET_TABLE"] + "_raw.csv"
params:
pid = "{pid}",
summarised = "{summarised}",
targets_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["TARGETS_RATIO_THRESHOLD"],
targets_value_threshold = config["PARAMS_FOR_ANALYSIS"]["TARGETS_VALUE_THRESHOLD"]
summarised = "{summarised}"
output:
"data/processed/{pid}/targets_{summarised}.csv"
script:

View File

@ -1,5 +1,6 @@
import pandas as pd
import configparser
import subprocess
import os
# read database credentials
@ -8,14 +9,20 @@ config = configparser.ConfigParser()
config.read(snakemake.input["db_credentials"])
# bash command to create table and restore tables from sql file
checkdb_cmd = "mysql -h " + config[group]["host"] + " -u " + config[group]["user"] + " -p" + config[group]["password"] + " -e \"use " + config[group]["database"] + "\""
checkdb_cmd = "mysql -h " + config[group]["host"] + " -u " + config[group]["user"] + " -p" + config[group]["password"] + " -e use " + config[group]["database"]
create_cmd = "mysql -h " + config[group]["host"] + " -u " + config[group]["user"] + " -p" + config[group]["password"] + " -e \"CREATE DATABASE IF NOT EXISTS " + config[group]["database"] + ";\""
restore_cmd = "mysql -h " + config[group]["host"] + " -u " + config[group]["user"] + " -p" + config[group]["password"] + " " + config[group]["database"] + " < data/external/" + config[group]["database"] + ".sql"
restore_cmd = "mysql -h " + config[group]["host"] + " -u " + config[group]["user"] + " -p" + config[group]["password"] + " " + config[group]["database"] + " < data/external/rapids_example.sql"
try:
os.system(checkdb_cmd)
except:
print(config[group]["database"] + " DB already exists.")
else:
print("Checking if " + config[group]["database"] + " database exists")
subprocess.run(checkdb_cmd.split(), check = True, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError:
print(config[group]["database"] + " database does not exist")
print("Creating " + config[group]["database"] + " database")
os.system(create_cmd)
print(config[group]["database"] + " database created")
print("Restoring rapids_example.sql")
os.system(restore_cmd)
print("rapids_example.sql restored in " + config[group]["database"] + " database")
else:
raise ValueError(config[group]["database"] + " DB already exists")
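The rewritten control flow hinges on ``subprocess.run(..., check=True)`` raising ``CalledProcessError`` when the command exits non-zero; ``os.system`` only returns the exit status and never raises, which is why the old ``try/except`` could not detect a missing database. A self-contained sketch of the new pattern, with a Python child process standing in for ``mysql``:

```python
import subprocess
import sys

# Stand-in for the mysql existence check: a child process that exits non-zero,
# as mysql does when the probed database does not exist.
probe_cmd = [sys.executable, "-c", "raise SystemExit(1)"]

try:
    subprocess.run(probe_cmd, check=True,
                   stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
    db_exists = True
except subprocess.CalledProcessError:
    db_exists = False  # a failing exit status means "create and restore the database"

print(db_exists)  # False
```

Passing the command as a list (rather than splitting a string) also sidesteps the quoting problems that forced the quotes out of ``checkdb_cmd`` above.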

View File

@ -3,8 +3,9 @@ import pandas as pd
def base_conversation_features(conversation_data, day_segment, requested_features,recordingMinutes,pausedMinutes,expectedMinutes):
# name of the features this function can compute
base_features_names = ["minutessilence", "minutesnoise", "minutesvoice", "minutesunknown","sumconversationduration","avgconversationduration",
"sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","sumenergy",
"avgenergy","sdenergy","minenergy","maxenergy","silencesensedfraction","noisesensedfraction",
"sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","noisesumenergy",
"noiseavgenergy","noisesdenergy","noiseminenergy","noisemaxenergy","voicesumenergy",
"voiceavgenergy","voicesdenergy","voiceminenergy","voicemaxenergy","silencesensedfraction","noisesensedfraction",
"voicesensedfraction","unknownsensedfraction","silenceexpectedfraction","noiseexpectedfraction","voiceexpectedfraction",
"unknownexpectedfraction","countconversation"]
@ -96,22 +97,36 @@ def base_conversation_features(conversation_data, day_segment, requested_feature
else:
conversation_features["conversation_" + day_segment + "_timelastconversation"] = 0
if "sumenergy" in features_to_compute:
conversation_features["conversation_" + day_segment + "_sumenergy"] = conversation_data.groupby(["local_date"])["double_energy"].sum()
if "noisesumenergy" in features_to_compute:
conversation_features["conversation_" + day_segment + "_noisesumenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_date"])["double_energy"].sum()
if "avgenergy" in features_to_compute:
conversation_features["conversation_" + day_segment + "_avgenergy"] = conversation_data.groupby(["local_date"])["double_energy"].mean()
if "noiseavgenergy" in features_to_compute:
conversation_features["conversation_" + day_segment + "_noiseavgenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_date"])["double_energy"].mean()
if "sdenergy" in features_to_compute:
conversation_features["conversation_" + day_segment + "_sdenergy"] = conversation_data.groupby(["local_date"])["double_energy"].std()
if "noisesdenergy" in features_to_compute:
conversation_features["conversation_" + day_segment + "_noisesdenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_date"])["double_energy"].std()
if "minenergy" in features_to_compute:
conversation_features["conversation_" + day_segment + "_minenergy"] = conversation_data.groupby(["local_date"])["double_energy"].min()
if "noiseminenergy" in features_to_compute:
conversation_features["conversation_" + day_segment + "_noiseminenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_date"])["double_energy"].min()
if "maxenergy" in features_to_compute:
conversation_features["conversation_" + day_segment + "_maxenergy"] = conversation_data.groupby(["local_date"])["double_energy"].max()
if "noisemaxenergy" in features_to_compute:
conversation_features["conversation_" + day_segment + "_noisemaxenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_date"])["double_energy"].max()
if "voicesumenergy" in features_to_compute:
conversation_features["conversation_" + day_segment + "_voicesumenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_date"])["double_energy"].sum()
if "voiceavgenergy" in features_to_compute:
conversation_features["conversation_" + day_segment + "_voiceavgenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_date"])["double_energy"].mean()
if "voicesdenergy" in features_to_compute:
conversation_features["conversation_" + day_segment + "_voicesdenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_date"])["double_energy"].std()
if "voiceminenergy" in features_to_compute:
conversation_features["conversation_" + day_segment + "_voiceminenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_date"])["double_energy"].min()
if "voicemaxenergy" in features_to_compute:
conversation_features["conversation_" + day_segment + "_voicemaxenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_date"])["double_energy"].max()
conversation_features = conversation_features.reset_index()
return conversation_features
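Every new feature above applies the same filter-then-groupby: keep only rows of one inference class (1 = noise, 2 = voice, per the code), then aggregate ``double_energy`` per ``local_date``. A stdlib-only sketch of that logic with toy rows and hypothetical energy values, using a summing accumulator in place of the pandas ``groupby``:

```python
from collections import defaultdict

# Toy conversation rows: (local_date, inference, double_energy).
rows = [
    ("2020-08-28", 1, 0.25),
    ("2020-08-28", 1, 0.5),
    ("2020-08-28", 2, 1.0),
    ("2020-08-29", 2, 0.5),
]

def sum_energy_by_date(rows, inference_label):
    """Sum double_energy per local_date for one inference class --
    the same filter-then-groupby the pandas code above applies."""
    totals = defaultdict(float)
    for local_date, inference, energy in rows:
        if inference == inference_label:
            totals[local_date] += energy
    return dict(totals)

print(sum_energy_by_date(rows, 1))  # {'2020-08-28': 0.75}
```

Swapping the ``+=`` accumulator for min/max/mean/std bookkeeping yields the other per-class statistics.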

View File

@ -49,7 +49,10 @@ def getMetrics(pred_y, pred_y_prob, true_y):
metrics = {}
# metrics for all categories
metrics["accuracy"] = accuracy_score(true_y, pred_y)
metrics["auc"] = roc_auc_score(true_y, pred_y_prob)
try:
metrics["auc"] = roc_auc_score(true_y, pred_y_prob)
except:
metrics["auc"] = None
metrics["kappa"] = cohen_kappa_score(true_y, pred_y)
# metrics for label 0
metrics["precision0"] = precision_score(true_y, pred_y, average=None, labels=[0,1], zero_division=0)[0]
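The ``try/except`` added around ``roc_auc_score`` guards the case where ``true_y`` contains only one class, for which scikit-learn raises ``ValueError`` because AUC is undefined. A sketch of that guard with a hypothetical stand-in scorer (so the example needs no scikit-learn); catching ``ValueError`` specifically, rather than the bare ``except`` in the diff, avoids swallowing unrelated failures:

```python
def safe_auc(true_y, pred_y_prob, auc_fn):
    """Return auc_fn(true_y, pred_y_prob), or None when the metric is undefined."""
    try:
        return auc_fn(true_y, pred_y_prob)
    except ValueError:
        return None

# Hypothetical scorer with roc_auc_score's failure mode: it needs both classes.
def toy_auc(true_y, pred_y_prob):
    if len(set(true_y)) < 2:
        raise ValueError("Only one class present in y_true.")
    return 1.0  # placeholder score; a real AUC would rank the probabilities

print(safe_auc([0, 0, 0], [0.1, 0.2, 0.3], toy_auc))  # None
print(safe_auc([0, 1, 0], [0.1, 0.9, 0.2], toy_auc))  # 1.0
```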

View File

@ -3,19 +3,10 @@ import numpy as np
pid = snakemake.params["pid"]
summarised = snakemake.params["summarised"]
targets_ratio_threshold = snakemake.params["targets_ratio_threshold"]
targets_value_threshold = snakemake.params["targets_value_threshold"]
participant_info = pd.read_csv(snakemake.input["participant_info"])
if summarised == "summarised":
targets = pd.DataFrame(columns=["pid", "target"])
if not participant_info.empty:
cesds = participant_info.loc[0, ["preop_cesd_total", "inpatient_cesd_total", "postop_cesd_total", "3month_cesd_total"]]
# targets: 1 => 50% (ceiling) or more of available CESD scores were 16 or higher; 0 => otherwise
num_threshold = int((cesds.count() + 1) * targets_ratio_threshold)
target = 1 if cesds.apply(lambda x : 1 if x >= targets_value_threshold else 0).sum() >= num_threshold else 0
targets.loc[0, :] = [pid, target]
raise ValueError("Do not support summarised features for example dataset.")
elif summarised == "notsummarised":
targets = participant_info[["local_date", "target"]]