Merge remote-tracking branch 'origin/master'

Include all participants again.
Drop NaN targets.
2022-04-12 17:27:25 +02:00 · 2022-04-12 17:20:19 +02:00 · 2022-04-12 17:01:49 +02:00 · 2022-04-12 16:59:42 +02:00 · 2022-04-12 16:55:01 +02:00 · 2022-04-12 14:23:58 +02:00
25 changed files with 1132 additions and 16 deletions
--- a/26
+++ b/26
@ -164,6 +164,16 @@ for provider in config["PHONE_CONVERSATION"]["PROVIDERS"].keys():
        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
        files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")

+# Request the PHONE_ESM pipeline outputs for every provider whose COMPUTE flag is set.
+for provider, esm_settings in config["PHONE_ESM"]["PROVIDERS"].items():
+    if not esm_settings["COMPUTE"]:
+        continue
+    # Raw answers, answers with datetime and cleaned answers, one file per participant.
+    for esm_pattern in (
+        "data/raw/{pid}/phone_esm_raw.csv",
+        "data/raw/{pid}/phone_esm_with_datetime.csv",
+        "data/interim/{pid}/phone_esm_clean.csv",
+    ):
+        files_to_compute.extend(expand(esm_pattern, pid=config["PIDS"]))
+    # Provider-level feature file; its name carries the script language and the provider key.
+    files_to_compute.extend(
+        expand(
+            "data/interim/{pid}/phone_esm_features/phone_esm_{language}_{provider_key}.csv",
+            pid=config["PIDS"],
+            language=get_script_language(esm_settings["SRC_SCRIPT"]),
+            provider_key=provider.lower(),
+        )
+    )
+    files_to_compute.extend(expand("data/processed/features/{pid}/phone_esm.csv", pid=config["PIDS"]))
+    # The merged all_sensor_features targets stay disabled for this sensor:
+    #files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv",pid=config["PIDS"]))
+    #files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
+
 # We can delete these if's as soon as we add feature PROVIDERS to any of these sensors
 if isinstance(config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"], dict):
    for provider in config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"].keys():
@ -403,9 +413,19 @@ for provider in config["ALL_CLEANING_OVERALL"]["PROVIDERS"].keys():
    if config["ALL_CLEANING_OVERALL"]["PROVIDERS"][provider]["COMPUTE"]:
        files_to_compute.extend(expand("data/processed/features/all_participants/all_sensor_features_cleaned_" + provider.lower() +".csv"))

-# Demographic features
-files_to_compute.extend(expand("data/raw/baseline_merged.csv"))
-files_to_compute.extend(expand("data/raw/{pid}/participant_baseline_raw.csv", pid=config["PIDS"]))
+# Baseline features: the merged questionnaire export plus the per-participant raw,
+# interim and feature files.
+if config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["COMPUTE"]:
+    files_to_compute.append("data/raw/baseline_merged.csv")
+    for baseline_pattern in (
+        "data/raw/{pid}/participant_baseline_raw.csv",
+        "data/interim/{pid}/baseline_questionnaires.csv",
+        "data/processed/features/{pid}/baseline_features.csv",
+    ):
+        files_to_compute.extend(expand(baseline_pattern, pid=config["PIDS"]))
+
+# Targets (labels): input files of the individual models and of the population model.
+if config["PARAMS_FOR_ANALYSIS"]["TARGET"]["COMPUTE"]:
+    files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/input.csv", pid=config["PIDS"]))
+    files_to_compute.append("data/processed/models/population_model/input.csv")
+
+#files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/output_{cv_method}/baselines.csv", pid=config["PIDS"], cv_method=config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"]))

 rule all:
    input:
--- a/config.yaml
+++ b/config.yaml
@ -234,6 +234,15 @@ PHONE_DATA_YIELD:
      MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1, minimum percentage of valid minutes in an hour to be considered valid.
      SRC_SCRIPT: src/features/phone_data_yield/rapids/main.R

+PHONE_ESM:
+  CONTAINER: esm
+  PROVIDERS:
+    STRAW:
+      COMPUTE: True
+      SCALES: ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support"]
+      FEATURES: [mean]
+      SRC_SCRIPT: src/features/phone_esm/straw/main.py
+
 # See https://www.rapids.science/latest/features/phone-keyboard/
 PHONE_KEYBOARD:
  CONTAINER: keyboard
@ -561,7 +570,7 @@ HISTOGRAM_PHONE_DATA_YIELD:

 # See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#2-heatmaps-of-overall-data-yield
 HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT:
-  PLOT: True
+  PLOT: False
  TIME: RELATIVE_TIME # ABSOLUTE_TIME or RELATIVE_TIME

 # See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#3-heatmap-of-recorded-phone-sensors
@ -629,10 +638,16 @@ ALL_CLEANING_OVERALL:

 PARAMS_FOR_ANALYSIS:
  BASELINE:
+    COMPUTE: True
    FOLDER: data/external/baseline
    CONTAINER: [results-survey637813_final.csv,  # Slovenia
                results-survey358134_final.csv,  # Belgium 1
                results-survey413767_final.csv  # Belgium 2
    ]
-    FEATURES: [age, gender]
+    QUESTION_LIST: survey637813+question_text.csv
+    FEATURES: [age, gender, startlanguage, limesurvey_demand, limesurvey_control, limesurvey_demand_control_ratio, limesurvey_demand_control_ratio_quartile]
    CATEGORICAL_FEATURES: [gender]
+
+  TARGET:
+    COMPUTE: True
+    LABEL: PANAS_negative_affect_mean
--- a/docs/change-log.md
+++ b/docs/change-log.md
@ -1,4 +1,9 @@
 # Change Log
+## v1.8.0
+- Add data stream for AWARE Micro server
+- Fix the NA bug in PHONE_LOCATIONS BARNETT provider
+- Fix the bug of data type for call_duration field
+- Fix the index bug of heatmap_sensors_per_minute_per_time_segment
 ## v1.7.1
 - Update docs for Git Flow section
 - Update RAPIDS paper information
--- a/docs/datastreams/aware-micro-mysql.md
+++ b/docs/datastreams/aware-micro-mysql.md
@ -0,0 +1,15 @@
+# `aware_micro_mysql`
+
+This [data stream](../../datastreams/data-streams-introduction) handles iOS and Android sensor data collected with the [AWARE Framework's](https://awareframework.com/) [AWARE Micro](https://github.com/denzilferreira/aware-micro) server and stored in a MySQL database.
+
+## Container
+A MySQL database with a table per sensor, each containing the data for all participants. Sensor data is stored in a JSON field within each table called `data`.
+
+The script to connect and download data from this container is at:
+```bash
+src/data/streams/aware_micro_mysql/container.R
+```
+
+## Format
+
+--8<---- "docs/snippets/aware_format.md"
--- a/docs/datastreams/data-streams-introduction.md
+++ b/docs/datastreams/data-streams-introduction.md
@ -16,6 +16,7 @@ For reference, these are the data streams we currently support:
 | Data Stream | Device | Format | Container | Docs
 |--|--|--|--|--|
 | `aware_mysql`| Phone | AWARE app | MySQL | [link](../aware-mysql)
+| `aware_micro_mysql`| Phone | AWARE Micro server | MySQL | [link](../aware-micro-mysql)
 | `aware_csv`| Phone | AWARE app | CSV files | [link](../aware-csv)
 | `aware_influxdb` (beta)| Phone | AWARE app | InfluxDB | [link](../aware-influxdb)
 | `fitbitjson_mysql`| Fitbit | JSON (per [Fitbit's API](https://dev.fitbit.com/build/reference/web-api/)) | MySQL | [link](../fitbitjson-mysql)
--- a/mkdocs.yml
+++ b/mkdocs.yml
@ -85,6 +85,7 @@ nav:
      - Introduction: datastreams/data-streams-introduction.md
      - Phone:
        - aware_mysql: datastreams/aware-mysql.md
+        - aware_micro_mysql: datastreams/aware-micro-mysql.md
        - aware_csv: datastreams/aware-csv.md
        - aware_influxdb (beta): datastreams/aware-influxdb.md
        - Mandatory Phone Format: datastreams/mandatory-phone-format.md
--- a/rules/features.smk
+++ b/rules/features.smk
@ -324,6 +324,27 @@ rule conversation_r_features:
    script:
        "../src/features/entry.R"

+# Runs src/features/phone_esm/straw/preprocess.py on one participant's ESM answers with
+# datetime and writes the result to data/interim/{pid}/phone_esm_clean.csv.
+rule preprocess_esm:
+    input: "data/raw/{pid}/phone_esm_with_datetime.csv"
+    params:
+        # The list of scales is always read from the STRAW provider entry of PHONE_ESM.
+        scales=lambda wildcards: config["PHONE_ESM"]["PROVIDERS"]["STRAW"]["SCALES"]
+    output: "data/interim/{pid}/phone_esm_clean.csv"
+    script:
+        "../src/features/phone_esm/straw/preprocess.py"
+
+# Runs the shared Python feature entry script (src/features/entry.py) on one participant's
+# cleaned ESM answers and time segment labels; the provider is taken from the
+# {provider_key} wildcard of the requested output file.
+rule esm_features:
+    input:
+        sensor_data = "data/interim/{pid}/phone_esm_clean.csv",
+        time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
+    params:
+        # Provider settings are looked up with the upper-cased wildcard, e.g. "straw" -> "STRAW".
+        provider = lambda wildcards: config["PHONE_ESM"]["PROVIDERS"][wildcards.provider_key.upper()],
+        provider_key = "{provider_key}",
+        sensor_key = "phone_esm",
+        scales=lambda wildcards: config["PHONE_ESM"]["PROVIDERS"][wildcards.provider_key.upper()]["SCALES"]
+    # NOTE(review): the file name hard-codes "python"; presumably only Python providers are expected here - confirm.
+    output: "data/interim/{pid}/phone_esm_features/phone_esm_python_{provider_key}.csv"
+    script:
+        "../src/features/entry.py"
+
 rule phone_keyboard_python_features:
    input:
        sensor_data = "data/raw/{pid}/phone_keyboard_with_datetime.csv",
--- a/rules/models.smk
+++ b/rules/models.smk
@ -14,3 +14,38 @@ rule download_baseline_data:
        "data/raw/{pid}/participant_baseline_raw.csv"
    script:
        "../src/data/download_baseline_data.py"
+
+# Runs src/data/baseline_features.py on one participant's raw baseline data and produces
+# two files: the interim questionnaire table and the baseline feature file.
+rule baseline_features:
+    input:
+        "data/raw/{pid}/participant_baseline_raw.csv"
+    params:
+        pid="{pid}",
+        features=config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["FEATURES"],
+        # Path assembled from the FOLDER and QUESTION_LIST entries of the BASELINE config.
+        question_filename=config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["FOLDER"] + "/" + config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["QUESTION_LIST"]
+    output:
+        interim="data/interim/{pid}/baseline_questionnaires.csv",
+        features="data/processed/features/{pid}/baseline_features.csv"
+    script:
+        "../src/data/baseline_features.py"
+
+# Runs src/models/select_targets.py on one participant's cleaned sensor features
+# (all_sensor_features_cleaned_rapids.csv) to produce the individual-model input file.
+rule select_target:
+    input:
+        cleaned_sensor_features = "data/processed/features/{pid}/all_sensor_features_cleaned_rapids.csv"
+    params:
+        # Target label as configured in PARAMS_FOR_ANALYSIS -> TARGET -> LABEL.
+        target_variable = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["LABEL"]
+    output:
+        "data/processed/models/individual_model/{pid}/input.csv"
+    script:
+        "../src/models/select_targets.py"
+
+# Runs src/models/merge_features_and_targets_for_population_model.py on the cleaned sensor
+# features of all participants plus the baseline feature file of every configured
+# participant, producing the population-model input file.
+rule merge_features_and_targets_for_population_model:
+    input:
+        cleaned_sensor_features = "data/processed/features/all_participants/all_sensor_features_cleaned_rapids.csv",
+        # One baseline feature file per participant ID listed in config["PIDS"].
+        demographic_features = expand("data/processed/features/{pid}/baseline_features.csv", pid=config["PIDS"]),
+    params:
+        # Target label as configured in PARAMS_FOR_ANALYSIS -> TARGET -> LABEL.
+        target_variable=config["PARAMS_FOR_ANALYSIS"]["TARGET"]["LABEL"]
+    output:
+        "data/processed/models/population_model/input.csv"
+    script:
+        "../src/models/merge_features_and_targets_for_population_model.py"
+
--- a/rules/preprocessing.smk
+++ b/rules/preprocessing.smk
@ -177,7 +177,6 @@ rule resample_episodes_with_datetime:
    script:
        "../src/data/datetime/readable_datetime.R"

-
 rule phone_application_categories:
    input:
        "data/raw/{pid}/phone_applications_{type}_with_datetime.csv"
--- a/src/data/baseline_features.py
+++ b/src/data/baseline_features.py
@ -0,0 +1,179 @@
+import numpy as np
+import pandas as pd
+
+pid = snakemake.params["pid"]
+requested_features = snakemake.params["features"]
+baseline_interim = pd.DataFrame(columns=["qid", "question", "score_original", "score"])
+baseline_features = pd.DataFrame(columns=requested_features)
+question_filename = snakemake.params["question_filename"]
+
+# Prefixes of the baseline column names that hold the demand and control items.
+JCQ_DEMAND = "JobEisen"
+JCQ_CONTROL = "JobControle"
+
+# Question IDs whose scores are reversed, per scale. Only the keys are used by this
+# script; the values appear to be the beginnings of the question texts, kept for reference.
+dict_JCQ_demand_control_reverse = {
+    JCQ_DEMAND: {
+        3: " [Od mene se ne zahteva,",
+        4: " [Imam dovolj časa, da končam",
+        5: " [Pri svojem delu se ne srečujem s konfliktnimi",
+    },
+    JCQ_CONTROL: {
+        2: " |Moje delo vključuje veliko ponavljajočega",
+        6: " [Pri svojem delu imam zelo malo svobode",
+    },
+}
+
+# Lowest and highest item score; a score is reversed as MAX + MIN - score.
+LIMESURVEY_JCQ_MIN = 1
+LIMESURVEY_JCQ_MAX = 4
+
+# Outer bounds of the demand/control ratio.
+# NOTE(review): presumably 5 demand items and 9 control items, each scored 1 to 4 - confirm.
+DEMAND_CONTROL_RATIO_MIN = 5 / (9 * 4)
+DEMAND_CONTROL_RATIO_MAX = (4 * 5) / 9
+
+# Gender-specific boundaries used to bin the demand/control ratio into quartiles:
+# quartile q covers the interval from entry q - 1 (inclusive) to entry q (exclusive).
+JCQ_NORMS = {
+    "F": {
+        0: DEMAND_CONTROL_RATIO_MIN,
+        1: 0.45,
+        2: 0.52,
+        3: 0.62,
+        4: DEMAND_CONTROL_RATIO_MAX,
+    },
+    "M": {
+        0: DEMAND_CONTROL_RATIO_MIN,
+        1: 0.41,
+        2: 0.48,
+        3: 0.56,
+        4: DEMAND_CONTROL_RATIO_MAX,
+    },
+}
+
+participant_info = pd.read_csv(snakemake.input[0], parse_dates=["date_of_birth"])
+
+if not participant_info.empty:
+    if "age" in requested_features:
+        now = pd.Timestamp("now")
+        baseline_features.loc[0, "age"] = (
+            now - participant_info.loc[0, "date_of_birth"]
+        ).days / 365.25245
+    if "gender" in requested_features:
+        baseline_features.loc[0, "gender"] = participant_info.loc[0, "gender"]
+    if "startlanguage" in requested_features:
+        baseline_features.loc[0, "startlanguage"] = participant_info.loc[
+            0, "startlanguage"
+        ]
+    if (
+        ("limesurvey_demand" in requested_features)
+        or ("limesurvey_control" in requested_features)
+        or ("limesurvey_demand_control_ratio" in requested_features)
+    ):
+        participant_info_t = participant_info.T
+        rows_baseline = participant_info_t.index
+
+        # Demand scale: select the demand items, reverse-score the flagged ones and add
+        # them to the interim table. Computed whenever the demand sum or the ratio is requested.
+        if ("limesurvey_demand" in requested_features) or (
+            "limesurvey_demand_control_ratio" in requested_features
+        ):
+            # Find questions about demand, but disregard time (duration of filling in questionnaire)
+            rows_demand = rows_baseline.str.startswith(
+                JCQ_DEMAND
+            ) & ~rows_baseline.str.endswith("Time")
+            limesurvey_demand = (
+                participant_info_t[rows_demand]
+                .reset_index()
+                .rename(columns={"index": "question", 0: "score_original"})
+            )
+            # Extract question IDs from names such as JobEisen[3]
+            limesurvey_demand["qid"] = (
+                limesurvey_demand["question"].str.extract(r"\[(\d+)\]").astype(int)
+            )
+            limesurvey_demand["score"] = limesurvey_demand["score_original"]
+            # Identify rows that include questions to be reversed.
+            rows_demand_reverse = limesurvey_demand["qid"].isin(
+                dict_JCQ_demand_control_reverse[JCQ_DEMAND].keys()
+            )
+            # Reverse the score, so that the maximum value becomes the minimum etc.
+            limesurvey_demand.loc[rows_demand_reverse, "score"] = (
+                LIMESURVEY_JCQ_MAX
+                + LIMESURVEY_JCQ_MIN
+                - limesurvey_demand.loc[rows_demand_reverse, "score_original"]
+            )
+            baseline_interim = pd.concat([baseline_interim, limesurvey_demand], axis=0, ignore_index=True)
+            # The feature is requested under the name "limesurvey_demand"; a check for the
+            # bare name "demand" does not match it and would leave the demand sum unset.
+            if "limesurvey_demand" in requested_features:
+                baseline_features.loc[0, "limesurvey_demand"] = limesurvey_demand[
+                    "score"
+                ].sum()
+
+        # Control scale: select the control items, reverse-score the flagged ones and add
+        # them to the interim table. Computed whenever the control sum or the ratio is requested.
+        if ("limesurvey_control" in requested_features) or (
+            "limesurvey_demand_control_ratio" in requested_features
+        ):
+            # Find questions about control, but disregard time (duration of filling in questionnaire)
+            rows_control = rows_baseline.str.startswith(
+                JCQ_CONTROL
+            ) & ~rows_baseline.str.endswith("Time")
+            limesurvey_control = (
+                participant_info_t[rows_control]
+                .reset_index()
+                .rename(columns={"index": "question", 0: "score_original"})
+            )
+            # Extract question IDs from names such as JobControle[3]
+            limesurvey_control["qid"] = (
+                limesurvey_control["question"].str.extract(r"\[(\d+)\]").astype(int)
+            )
+            limesurvey_control["score"] = limesurvey_control["score_original"]
+            # Identify rows that include questions to be reversed.
+            rows_control_reverse = limesurvey_control["qid"].isin(
+                dict_JCQ_demand_control_reverse[JCQ_CONTROL].keys()
+            )
+            # Reverse the score, so that the maximum value becomes the minimum etc.
+            limesurvey_control.loc[rows_control_reverse, "score"] = (
+                LIMESURVEY_JCQ_MAX
+                + LIMESURVEY_JCQ_MIN
+                - limesurvey_control.loc[rows_control_reverse, "score_original"]
+            )
+
+            baseline_interim = pd.concat([baseline_interim, limesurvey_control], axis=0, ignore_index=True)
+
+            # The feature itself is the sum of the (partly reversed) item scores.
+            if "limesurvey_control" in requested_features:
+                baseline_features.loc[0, "limesurvey_control"] = limesurvey_control[
+                    "score"
+                ].sum()
+
+        # Demand/control ratio and its gender-specific quartile (1-4, or NaN when it
+        # cannot be determined).
+        if "limesurvey_demand_control_ratio" in requested_features:
+            limesurvey_demand_control_ratio = (
+                limesurvey_demand["score"].sum() / limesurvey_control["score"].sum()
+            )
+            # Look the norms up once. A missing gender or one without norms yields None
+            # instead of raising a KeyError, and the quartile then stays NaN.
+            gender_norms = JCQ_NORMS.get(participant_info.loc[0, "gender"])
+            limesurvey_quartile = np.nan
+            if gender_norms is not None:
+                for quartile in range(1, 5):
+                    lower_bound = gender_norms[quartile - 1]
+                    upper_bound = gender_norms[quartile]
+                    # Intervals are [lower, upper); the last one also includes its upper
+                    # bound so that the maximum possible ratio lands in quartile 4.
+                    if lower_bound <= limesurvey_demand_control_ratio < upper_bound or (
+                        quartile == 4
+                        and limesurvey_demand_control_ratio == upper_bound
+                    ):
+                        limesurvey_quartile = quartile
+                        break
+
+            baseline_features.loc[
+                0, "limesurvey_demand_control_ratio"
+            ] = limesurvey_demand_control_ratio
+            baseline_features.loc[
+                0, "limesurvey_demand_control_ratio_quartile"
+            ] = limesurvey_quartile
+
+# Always write the interim table, even when it only contains the header: the
+# baseline_features rule declares this file as an output, so leaving it out would make
+# Snakemake fail the job because of a missing output file.
+baseline_interim.to_csv(snakemake.output["interim"], index=False, encoding="utf-8")
+
+baseline_features.to_csv(snakemake.output["features"], index=False, encoding="utf-8")
--- a/src/data/merge_baseline_data.py
+++ b/src/data/merge_baseline_data.py
@ -24,10 +24,6 @@ baseline = (
 )

 baseline.rename(columns=VARIABLES_TO_TRANSLATE, copy=False, inplace=True)
-now = pd.Timestamp("now")
-baseline = baseline.assign(
-    age=lambda x: (now - x.date_of_birth).dt.days / 365.25245,
-)

 baseline.to_csv(snakemake.output[0],
                index=False,
--- a/src/data/streams/aware_micro_mysql/container.R
+++ b/src/data/streams/aware_micro_mysql/container.R
@ -0,0 +1,85 @@
+# if you need a new package, you should add it with renv::install(package) so your renv venv is updated
+library(RMariaDB)
+library(yaml)
+
+#' @description
+#' Auxiliary function to parse the connection credentials from a specific group in ./credentials.yaml
+#' You can reuse most of this function if you are connecting to a DB or Web API.
+#' It's OK to delete this function if you don't need credentials, e.g., you are pulling data from a CSV file.
+#' @param group the yaml key containing the credentials to connect to a database
+#' @return dbEngine a database engine (connection) ready to perform queries
+get_db_engine <- function(group){
+  # The working dir is always the RAPIDS root folder, so your credentials file is always ./credentials.yaml
+  credentials <- read_yaml("./credentials.yaml")
+  # Fail early with the list of available groups; the hint names the config key of this (aware_micro_mysql) stream
+  if(!group %in% names(credentials))
+    stop(paste("The credentials group",group, "does not exist in ./credentials.yaml. The only groups that exist in that file are:", paste(names(credentials), collapse = ","), ". Did you forget to set the group in [PHONE_DATA_STREAMS][aware_micro_mysql][DATABASE_GROUP] in config.yaml?"))
+  dbEngine <- dbConnect(MariaDB(), db = credentials[[group]][["database"]],
+                                    username = credentials[[group]][["user"]],
+                                    password = credentials[[group]][["password"]],
+                                    host = credentials[[group]][["host"]],
+                                    port = credentials[[group]][["port"]])
+  return(dbEngine)
+}
+
+# This file gets executed for each PHONE_SENSOR of each participant
+# If you are connecting to a database the env file containing its credentials is available at "./.env"
+# If you are reading a CSV file instead of a DB table, the @param sensor_container will contain the file path as set in config.yaml
+# You are not bound to databases or files, you can query a web API or whatever data source you need.
+
+#' @description
+#' RAPIDS allows users to use the keyword "infer" (previously "multiple") to automatically infer the mobile Operative System a device was running.
+#' If you have a way to infer the OS of a device ID, implement this function. For example, for AWARE data we use the "aware_device" table.
+#'
+#' If you don't have a way to infer the OS, call stop("Error Message") so other users know they can't use "infer" or the inference failed,
+#' and they have to assign the OS manually in the participant file
+#'
+#' @param stream_parameters The PHONE_STREAM_PARAMETERS key in config.yaml. If you need specific parameters add them there.
+#' @param device A device ID string
+#' @return The OS the device ran, "android" or "ios" (one value per matching row of aware_device)
+
+infer_device_os <- function(stream_parameters, device){
+  dbEngine <- get_db_engine(stream_parameters$DATABASE_GROUP)
+  # Close the connection whenever the function exits, including when the query fails
+  on.exit(dbDisconnect(dbEngine), add = TRUE)
+
+  # NOTE(review): pull_data() in this file reads sensor columns from the JSON `data` field; confirm that `brand` is a plain column of aware_device in this container.
+  query <- paste0("SELECT device_id,brand FROM aware_device WHERE device_id = '", device, "'")
+  message(paste0("Executing the following query to infer phone OS: ", query))
+  os <- dbGetQuery(dbEngine, query)
+
+  if(nrow(os) == 0)
+    stop(paste("We cannot infer the OS of the following device id because it does not exist in the aware_device table:", device))
+
+  # Base R is used here so that this file does not depend on dplyr being attached by whoever sources it
+  return(ifelse(os$brand == "iPhone", "ios", "android"))
+}
+
+#' @description
+#' Gets the sensor data for a specific device id from a database table, file or whatever source you want to query
+#'
+#' @param stream_parameters The PHONE_STREAM_PARAMETERS key in config.yaml. If you need specific parameters add them there.
+#' @param device A device ID string
+#' @param sensor Not used by this function
+#' @param sensor_container database table or file containing the sensor data for all participants. This is the PHONE_SENSOR[CONTAINER] key in config.yaml
+#' @param columns the columns needed from this sensor (we recommend to only return these columns instead of every column in sensor_container)
+#' @return A dataframe with the sensor data for device
+
+pull_data <- function(stream_parameters, device, sensor, sensor_container, columns){
+  dbEngine <- get_db_engine(stream_parameters$DATABASE_GROUP)
+  # Close the connection whenever the function exits, including when the query fails
+  on.exit(dbDisconnect(dbEngine), add = TRUE)
+
+  # Every requested column is extracted from the JSON `data` field and aliased to its own name
+  select_items <- unlist(lapply(columns, function(column) paste0("data->>'$.", column, "' ", column)), use.names = FALSE)
+
+  query <- paste0("SELECT ", paste(select_items, collapse = ",")," FROM ", sensor_container, " WHERE ", columns$DEVICE_ID ," = '", device,"'")
+
+  # Letting the user know what we are doing
+  message(paste0("Executing the following query to download data: ", query))
+  sensor_data <- dbGetQuery(dbEngine, query)
+
+  # paste0 keeps the device id tight inside the quotes (paste would insert spaces around it)
+  if(nrow(sensor_data) == 0)
+    warning(paste0("The device '", device,"' did not have data in ", sensor_container))
+
+  return(sensor_data)
+}
+
--- a/src/data/streams/aware_micro_mysql/format.yaml
+++ b/src/data/streams/aware_micro_mysql/format.yaml
@ -0,0 +1,337 @@
+PHONE_ACCELEROMETER:
+  ANDROID:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      DOUBLE_VALUES_0: double_values_0
+      DOUBLE_VALUES_1: double_values_1
+      DOUBLE_VALUES_2: double_values_2
+    MUTATION:
+      COLUMN_MAPPINGS:
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+  IOS:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      DOUBLE_VALUES_0: double_values_0
+      DOUBLE_VALUES_1: double_values_1
+      DOUBLE_VALUES_2: double_values_2
+    MUTATION:
+      COLUMN_MAPPINGS:
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+
+PHONE_ACTIVITY_RECOGNITION:
+  ANDROID:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      ACTIVITY_NAME: activity_name
+      ACTIVITY_TYPE: activity_type
+      CONFIDENCE: confidence
+    MUTATION:
+      COLUMN_MAPPINGS:
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+  IOS:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      ACTIVITY_NAME: FLAG_TO_MUTATE
+      ACTIVITY_TYPE: FLAG_TO_MUTATE
+      CONFIDENCE: FLAG_TO_MUTATE
+    MUTATION:
+      COLUMN_MAPPINGS:
+        ACTIVITIES: activities
+        CONFIDENCE: confidence
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+        - "src/data/streams/mutations/phone/aware/activity_recogniton_ios_unification.R"
+
+PHONE_APPLICATIONS_CRASHES:
+  ANDROID:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      PACKAGE_NAME: package_name
+      APPLICATION_NAME: application_name
+      APPLICATION_VERSION: application_version
+      ERROR_SHORT: error_short
+      ERROR_LONG: error_long
+      ERROR_CONDITION: error_condition
+      IS_SYSTEM_APP: is_system_app
+    MUTATION:
+      COLUMN_MAPPINGS:
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+
+PHONE_APPLICATIONS_FOREGROUND:
+  ANDROID:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      PACKAGE_NAME: package_name
+      APPLICATION_NAME: application_name
+      IS_SYSTEM_APP: is_system_app
+    MUTATION:
+      COLUMN_MAPPINGS:
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+
+PHONE_APPLICATIONS_NOTIFICATIONS:
+  ANDROID:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      PACKAGE_NAME: package_name
+      APPLICATION_NAME: application_name
+      TEXT: text
+      SOUND: sound
+      VIBRATE: vibrate
+      DEFAULTS: defaults
+      FLAGS: flags
+    MUTATION:
+      COLUMN_MAPPINGS:
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+
+PHONE_BATTERY:
+  ANDROID:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      BATTERY_STATUS: battery_status
+      BATTERY_LEVEL: battery_level
+      BATTERY_SCALE: battery_scale
+    MUTATION:
+      COLUMN_MAPPINGS:
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+  IOS:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      BATTERY_STATUS: FLAG_TO_MUTATE
+      BATTERY_LEVEL: battery_level
+      BATTERY_SCALE: battery_scale
+    MUTATION:
+      COLUMN_MAPPINGS:
+        BATTERY_STATUS: battery_status
+      SCRIPTS:
+        - "src/data/streams/mutations/phone/aware/battery_ios_unification.R"
+
+PHONE_BLUETOOTH:
+  ANDROID:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      BT_ADDRESS: bt_address
+      BT_NAME: bt_name
+      BT_RSSI: bt_rssi
+    MUTATION:
+      COLUMN_MAPPINGS:
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+  IOS:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      BT_ADDRESS: bt_address
+      BT_NAME: bt_name
+      BT_RSSI: bt_rssi
+    MUTATION:
+      COLUMN_MAPPINGS:
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+
+PHONE_CALLS:
+  ANDROID:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      CALL_TYPE: call_type
+      CALL_DURATION: call_duration
+      TRACE: trace
+    MUTATION:
+      COLUMN_MAPPINGS:
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+  IOS:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      CALL_TYPE: FLAG_TO_MUTATE
+      CALL_DURATION: call_duration
+      TRACE: trace
+    MUTATION:
+      COLUMN_MAPPINGS:
+        CALL_TYPE: call_type
+      SCRIPTS:
+        - "src/data/streams/mutations/phone/aware/calls_ios_unification.R"
+
+PHONE_CONVERSATION:
+  ANDROID:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      DOUBLE_ENERGY: double_energy
+      INFERENCE: inference
+      DOUBLE_CONVO_START: double_convo_start
+      DOUBLE_CONVO_END: double_convo_end
+    MUTATION:
+      COLUMN_MAPPINGS:
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+  IOS:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      DOUBLE_ENERGY: double_energy
+      INFERENCE: inference
+      DOUBLE_CONVO_START: double_convo_start
+      DOUBLE_CONVO_END: double_convo_end
+    MUTATION:
+      COLUMN_MAPPINGS:
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+        - "src/data/streams/mutations/phone/aware/conversation_ios_timestamp.R"
+
+PHONE_KEYBOARD:
+  ANDROID:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      PACKAGE_NAME: package_name
+      BEFORE_TEXT: before_text
+      CURRENT_TEXT: current_text
+      IS_PASSWORD: is_password
+    MUTATION:
+      COLUMN_MAPPINGS:
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+
+PHONE_LIGHT:
+  ANDROID:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      DOUBLE_LIGHT_LUX: double_light_lux
+      ACCURACY: accuracy
+    MUTATION:
+      COLUMN_MAPPINGS:
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+
+PHONE_LOCATIONS:
+  ANDROID:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      DOUBLE_LATITUDE: double_latitude
+      DOUBLE_LONGITUDE: double_longitude
+      DOUBLE_BEARING: double_bearing
+      DOUBLE_SPEED: double_speed
+      DOUBLE_ALTITUDE: double_altitude
+      PROVIDER: provider
+      ACCURACY: accuracy
+    MUTATION:
+      COLUMN_MAPPINGS:
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+  IOS:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      DOUBLE_LATITUDE: double_latitude
+      DOUBLE_LONGITUDE: double_longitude
+      DOUBLE_BEARING: double_bearing
+      DOUBLE_SPEED: double_speed
+      DOUBLE_ALTITUDE: double_altitude
+      PROVIDER: provider
+      ACCURACY: accuracy
+    MUTATION:
+      COLUMN_MAPPINGS:
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+
+PHONE_LOG:
+  ANDROID:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      LOG_MESSAGE: log_message
+    MUTATION:
+      COLUMN_MAPPINGS:
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+  IOS:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      LOG_MESSAGE: log_message
+    MUTATION:
+      COLUMN_MAPPINGS:
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+
+PHONE_MESSAGES:
+  ANDROID:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      MESSAGE_TYPE: message_type
+      TRACE: trace
+    MUTATION:
+      COLUMN_MAPPINGS:
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+
+PHONE_SCREEN:
+  ANDROID:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      SCREEN_STATUS: screen_status
+    MUTATION:
+      COLUMN_MAPPINGS:
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+  IOS:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      SCREEN_STATUS: FLAG_TO_MUTATE
+    MUTATION:
+      COLUMN_MAPPINGS:
+        SCREEN_STATUS: screen_status
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+        - "src/data/streams/mutations/phone/aware/screen_ios_unification.R"
+
+PHONE_WIFI_CONNECTED:
+  ANDROID:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      MAC_ADDRESS: mac_address
+      SSID: ssid
+      BSSID: bssid
+    MUTATION:
+      COLUMN_MAPPINGS:
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+  IOS:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      MAC_ADDRESS: mac_address
+      SSID: ssid
+      BSSID: bssid
+    MUTATION:
+      COLUMN_MAPPINGS:
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+
+PHONE_WIFI_VISIBLE:
+  ANDROID:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      SSID: ssid
+      BSSID: bssid
+      SECURITY: security
+      FREQUENCY: frequency
+      RSSI: rssi
+    MUTATION:
+      COLUMN_MAPPINGS:
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+  IOS:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      SSID: ssid
+      BSSID: bssid
+      SECURITY: security
+      FREQUENCY: frequency
+      RSSI: rssi
+    MUTATION:
+      COLUMN_MAPPINGS:
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+
--- a/src/data/streams/aware_postgresql/format.yaml
+++ b/src/data/streams/aware_postgresql/format.yaml
@ -183,6 +183,21 @@ PHONE_CONVERSATION:
      SCRIPTS: # List any python or r scripts that mutate your raw data
        - "src/data/streams/mutations/phone/aware/conversation_ios_timestamp.R"

+PHONE_ESM:
+  ANDROID:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: double_esm_user_answer_timestamp
+      DEVICE_ID: device_id
+      ESM_STATUS: esm_status
+      ESM_USER_ANSWER: esm_user_answer
+      ESM_JSON: esm_json
+      ESM_TRIGGER: esm_trigger
+      ESM_SESSION: esm_session
+      ESM_NOTIFICATION_ID: esm_notification_id
+    MUTATION:
+      COLUMN_MAPPINGS:
+      SCRIPTS:
+
 PHONE_KEYBOARD:
  ANDROID:
    RAPIDS_COLUMN_MAPPINGS:
--- a/src/data/streams/mutations/phone/aware/calls_ios_unification.R
+++ b/src/data/streams/mutations/phone/aware/calls_ios_unification.R
@ -39,7 +39,7 @@ unify_ios_calls <- function(ios_calls){
                        assigned_segments = first(assigned_segments))
        }
        else {
-            ios_calls <- ios_calls %>% summarise(call_type_sequence = paste(call_type, collapse = ","), call_duration = sum(call_duration),  timestamp = first(timestamp), device_id = first(device_id))
+            ios_calls <- ios_calls %>% summarise(call_type_sequence = paste(call_type, collapse = ","), call_duration = sum(as.numeric(call_duration)),  timestamp = first(timestamp), device_id = first(device_id))
        }
        ios_calls <- ios_calls %>% mutate(call_type = case_when(
            call_type_sequence == "1,2,4" | call_type_sequence == "2,1,4" ~ 1, # incoming
--- a/src/data/streams/rapids_columns.yaml
+++ b/src/data/streams/rapids_columns.yaml
@ -67,6 +67,16 @@ PHONE_CONVERSATION:
  - DOUBLE_CONVO_START
  - DOUBLE_CONVO_END

+PHONE_ESM:
+  - TIMESTAMP
+  - DEVICE_ID
+  - ESM_STATUS
+  - ESM_USER_ANSWER
+  - ESM_JSON
+  - ESM_TRIGGER
+  - ESM_SESSION
+  - ESM_NOTIFICATION_ID
+
 PHONE_KEYBOARD:
  - TIMESTAMP
  - DEVICE_ID
--- a/src/features/phone_esm/straw/esm_JCQ.py
+++ b/src/features/phone_esm/straw/esm_JCQ.py
@ -0,0 +1,108 @@
+import pandas as pd
+
+# Bounds of the original JCQ answer scale.
+# They are used in this module to reverse an item:
+# reversed score = JCQ_ORIGINAL_MAX + JCQ_ORIGINAL_MIN - score.
+JCQ_ORIGINAL_MAX = 4
+JCQ_ORIGINAL_MIN = 1
+
+# Items that need to be reverse-scored, keyed by question ID.
+# Each value is a tuple of expected beginnings of the question text
+# (one entry per language or wording variant).
+# The text found in the esm_instructions column is checked against these prefixes
+# with str.startswith before any answer is reversed.
+dict_JCQ_demand_control_reverse = {
+    75: (
+        "I was NOT asked",
+        "Men legde mij geen overdreven",
+        "Men legde mij GEEN overdreven",  # Capitalized in some versions
+        "Od mene se NI zahtevalo",
+    ),
+    76: (
+        "I had enough time to do my work",
+        "Ik had voldoende tijd om mijn werk",
+        "Imela sem dovolj časa, da končam",
+        "Imel sem dovolj časa, da končam",
+    ),
+    77: (
+        "I was free of conflicting demands",
+        "Er werden mij op het werk geen tegenstrijdige",
+        "Er werden mij op het werk GEEN tegenstrijdige",  # Capitalized in some versions
+        "Pri svojem delu se NISEM srečeval",
+    ),
+    79: (
+        "My job involved a lot of repetitive work",
+        "Mijn taak omvatte veel repetitief werk",
+        "Moje delo je vključevalo veliko ponavljajočega",
+    ),
+    85: (
+        "On my job, I had very little freedom",
+        "In mijn taak had ik zeer weinig vrijheid",
+        "Pri svojem delu sem imel zelo malo svobode",
+        "Pri svojem delu sem imela zelo malo svobode",
+    ),
+}
+
+
+def reverse_jcq_demand_control_scoring(
+    df_esm_jcq_demand_control: pd.DataFrame,
+) -> pd.DataFrame:
+    """
+    Recode answers to the Job Content Questionnaire (JCQ) to its original scoring.
+
+    Every answer is first incremented by 1 to be in line with the original
+    (JCQ_ORIGINAL_MIN to JCQ_ORIGINAL_MAX) scoring.
+    Then, answers to negatively phrased questions are reversed (i.e. 1 becomes 4 etc.).
+    These questions are listed in dict_JCQ_demand_control_reverse by their question ID.
+    Before any recoding, the literal phrasing found in the data is checked
+    against the expected phrasings to protect against wrong numbering of questions
+    (differing question IDs).
+
+    Parameters
+    ----------
+    df_esm_jcq_demand_control: pd.DataFrame
+        A cleaned up dataframe with the columns question_id, esm_instructions
+        and esm_user_answer_numeric. It is not modified.
+
+    Returns
+    -------
+    df_esm_jcq_demand_control: pd.DataFrame
+        A copy of the dataframe with a column esm_user_score
+        containing answers recoded and reversed.
+
+    Raises
+    ------
+    KeyError
+        If the text of a question that should be reversed does not start
+        with any of the expected phrasings. The offending texts are attached.
+    AttributeError
+        If the column esm_user_answer_numeric is missing, i.e. the dataframe
+        has not been cleaned yet. This used to be only printed, which let
+        unscored data continue silently to the next step.
+    """
+    for q_id, expected_beginnings in dict_JCQ_demand_control_reverse.items():
+        # All distinct phrasings that occur in the data for this question ID.
+        phrasings_in_data = (
+            df_esm_jcq_demand_control.loc[
+                df_esm_jcq_demand_control["question_id"] == q_id, "esm_instructions"
+            ]
+            .dropna()
+            .drop_duplicates()
+            .astype(str)
+        )
+        phrasing_is_expected = phrasings_in_data.str.startswith(expected_beginnings)
+        # Use `not` rather than `~`: on a plain Python bool, `~` yields -1 or -2,
+        # both of which are truthy.
+        if not phrasing_is_expected.all():
+            print("One of the answers that occur in the data should not be reversed.")
+            print("This was the answer found in the data: ")
+            raise KeyError(phrasings_in_data[~phrasing_is_expected])
+
+    if "esm_user_answer_numeric" not in df_esm_jcq_demand_control.columns:
+        raise AttributeError(
+            "Please, clean the dataframe first using features.esm.clean_up_esm."
+        )
+
+    # Increment the original answer by 1
+    # to keep in line with traditional scoring (JCQ_ORIGINAL_MIN - JCQ_ORIGINAL_MAX).
+    df_scored = df_esm_jcq_demand_control.assign(
+        esm_user_score=df_esm_jcq_demand_control["esm_user_answer_numeric"] + 1
+    )
+
+    # Reverse the items that require it. Only the score column is written.
+    needs_reversing = df_scored["question_id"].isin(
+        list(dict_JCQ_demand_control_reverse)
+    )
+    df_scored.loc[needs_reversing, "esm_user_score"] = (
+        JCQ_ORIGINAL_MAX
+        + JCQ_ORIGINAL_MIN
+        - df_scored.loc[needs_reversing, "esm_user_score"]
+    )
+
+    return df_scored
--- a/src/features/phone_esm/straw/esm_preprocess.py
+++ b/src/features/phone_esm/straw/esm_preprocess.py
@ -0,0 +1,135 @@
+import json
+
+import numpy as np
+import pandas as pd
+
+# Numeric codes of ESM question types, keyed by type name.
+# clean_up_esm compares these codes with the esm_type column.
+ESM_TYPE = {
+    "text": 1,
+    "radio": 2,
+    "checkbox": 3,
+    "likert": 4,
+    "quick_answers": 5,
+    "scale": 6,
+    "datetime": 7,
+    "pam": 8,
+    "number": 9,
+    "web": 10,
+    "date": 11,
+}
+
+# Numeric questionnaire IDs, keyed by scale name.
+# The preprocessing script compares these IDs with the questionnaire_id column.
+QUESTIONNAIRE_IDS = {
+    "sleep_quality": 1,
+    "PANAS_positive_affect": 8,
+    "PANAS_negative_affect": 9,
+    "JCQ_job_demand": 10,
+    "JCQ_job_control": 11,
+    "JCQ_supervisor_support": 12,
+    "JCQ_coworker_support": 13,
+    "PFITS_supervisor": 14,
+    "PFITS_coworkers": 15,
+    "UWES_vigor": 16,
+    "UWES_dedication": 17,
+    "UWES_absorption": 18,
+    "COPE_active": 19,
+    "COPE_support": 20,
+    "COPE_emotions": 21,
+    "balance_life_work": 22,
+    "balance_work_life": 23,
+    "recovery_experience_detachment": 24,
+    "recovery_experience_relaxation": 25,
+    "symptoms": 26,
+    "appraisal_stressfulness_event": 87,
+    "appraisal_threat": 88,
+    "appraisal_challenge": 89,
+    "appraisal_event_time": 90,
+    "appraisal_event_duration": 91,
+    "appraisal_event_work_related": 92,
+    "appraisal_stressfulness_period": 93,
+    "late_work": 94,
+    "work_hours": 95,
+    "left_work": 96,
+    "activities": 97,
+    "coffee_breaks": 98,
+    "at_work_yet": 99,
+}
+
+# Value of esm_status that clean_up_esm treats as an answered ESM.
+ESM_STATUS_ANSWERED = 2
+
+# NOTE(review): not referenced by the functions in this module;
+# presumably the columns used to group rows into sessions - confirm against callers.
+GROUP_SESSIONS_BY = ["participant_id", "device_id", "esm_session"]
+
+# NOTE(review): session status labels, not referenced by the functions in this module;
+# presumably used when classifying sessions elsewhere - confirm.
+SESSION_STATUS_UNANSWERED = "ema_unanswered"
+SESSION_STATUS_DAY_FINISHED = "day_finished"
+SESSION_STATUS_COMPLETE = "ema_completed"
+
+# Special values of esm_user_answer.
+# clean_up_esm removes rows carrying any of them.
+ANSWER_DAY_FINISHED = "DayFinished3421"
+ANSWER_DAY_OFF = "DayOff3421"
+ANSWER_SET_EVENING = "DayFinishedSetEvening"
+
+MAX_MORNING_LENGTH = 3
+# When the participant was not yet at work at the time of the first (morning) EMA,
+# only three items were answered.
+# Two sleep related items and one indicating NOT starting work yet.
+# Daytime EMAs are all longer, in fact they always consist of at least 6 items.
+
+
+def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
+    """
+    Expand the JSON column of ESM data into several Pandas DF columns.
+
+    Each value of the esm_json column is parsed and flattened with
+    pd.json_normalize. The resulting columns are appended to the original
+    dataframe row by row (by position), so the index of df_esm is preserved
+    even if it is not a default RangeIndex.
+
+    Parameters
+    ----------
+    df_esm: pd.DataFrame
+        A dataframe of esm data with a column esm_json containing JSON strings.
+
+    Returns
+    -------
+    df_esm_preprocessed: pd.DataFrame
+        The original dataframe with added columns for all fields from esm_json,
+        except esm_trigger.
+    """
+    # Pass a plain list so that json_normalize always returns a 0..n-1 index.
+    esm_json_records = df_esm["esm_json"].apply(json.loads).tolist()
+    df_esm_json = pd.json_normalize(esm_json_records).drop(
+        columns=["esm_trigger"], errors="ignore"
+    )  # The esm_trigger column is already present in the main df.
+    # errors="ignore" covers empty input and JSON without an esm_trigger field.
+
+    # Join by row position and then restore the caller's index.
+    # Joining on the original index would misalign rows or create NaNs
+    # whenever df_esm has been filtered or re-indexed beforehand.
+    df_esm_preprocessed = df_esm.reset_index(drop=True).join(df_esm_json)
+    df_esm_preprocessed.index = df_esm.index
+    return df_esm_preprocessed
+
+
+def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
+    """
+    This function eliminates invalid ESM responses.
+    It removes unanswered ESMs and those that indicate end of work and similar.
+    It also extracts a numeric answer from strings such as "4 - I strongly agree".
+
+    The numeric answer is the leading (optionally negative) integer of
+    esm_user_answer, so multi-digit answers such as "10" are kept whole.
+    It is only extracted for the radio, scale and number ESM types.
+    All other rows, and answers without a leading integer, get NaN.
+
+    Parameters
+    ----------
+    df_esm_preprocessed: pd.DataFrame
+        A preprocessed dataframe of esm data. It must include the columns
+        esm_status, esm_user_answer and esm_type. It is not modified.
+
+    Returns
+    -------
+    df_esm_clean: pd.DataFrame
+        A subset (copy) of the original dataframe
+        with the added column esm_user_answer_numeric.
+
+    """
+    is_answered = df_esm_preprocessed["esm_status"] == ESM_STATUS_ANSWERED
+    is_special_answer = df_esm_preprocessed["esm_user_answer"].isin(
+        [ANSWER_DAY_FINISHED, ANSWER_DAY_OFF, ANSWER_SET_EVENING]
+    )
+    # Copy explicitly, so that adding a column below does not write into
+    # a view of the caller's dataframe (SettingWithCopyWarning).
+    df_esm_clean = df_esm_preprocessed[is_answered & ~is_special_answer].copy()
+
+    df_esm_clean["esm_user_answer_numeric"] = np.nan
+    esm_type_numeric = [
+        ESM_TYPE.get("radio"),
+        ESM_TYPE.get("scale"),
+        ESM_TYPE.get("number"),
+    ]
+    has_numeric_type = df_esm_clean["esm_type"].isin(esm_type_numeric)
+    # Take the whole leading integer rather than only the first character.
+    leading_integer = (
+        df_esm_clean.loc[has_numeric_type, "esm_user_answer"]
+        .astype(str)
+        .str.extract(r"^\s*(-?\d+)", expand=False)
+    )
+    # Only the new column is written; all other columns keep their dtypes.
+    df_esm_clean.loc[has_numeric_type, "esm_user_answer_numeric"] = pd.to_numeric(
+        leading_integer, errors="coerce"
+    )
+    return df_esm_clean
--- a/src/features/phone_esm/straw/main.py
+++ b/src/features/phone_esm/straw/main.py
@ -0,0 +1,63 @@
+import pandas as pd
+
+# Numeric questionnaire IDs, keyed by scale name.
+# straw_features looks up each requested scale here
+# and selects the rows whose questionnaire_id equals the ID found.
+QUESTIONNAIRE_IDS = {
+    "sleep_quality": 1,
+    "PANAS_positive_affect": 8,
+    "PANAS_negative_affect": 9,
+    "JCQ_job_demand": 10,
+    "JCQ_job_control": 11,
+    "JCQ_supervisor_support": 12,
+    "JCQ_coworker_support": 13,
+    "PFITS_supervisor": 14,
+    "PFITS_coworkers": 15,
+    "UWES_vigor": 16,
+    "UWES_dedication": 17,
+    "UWES_absorption": 18,
+    "COPE_active": 19,
+    "COPE_support": 20,
+    "COPE_emotions": 21,
+    "balance_life_work": 22,
+    "balance_work_life": 23,
+    "recovery_experience_detachment": 24,
+    "recovery_experience_relaxation": 25,
+    "symptoms": 26,
+    "appraisal_stressfulness_event": 87,
+    "appraisal_threat": 88,
+    "appraisal_challenge": 89,
+    "appraisal_event_time": 90,
+    "appraisal_event_duration": 91,
+    "appraisal_event_work_related": 92,
+    "appraisal_stressfulness_period": 93,
+    "late_work": 94,
+    "work_hours": 95,
+    "left_work": 96,
+    "activities": 97,
+    "coffee_breaks": 98,
+    "at_work_yet": 99,
+}
+
+
+def straw_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
+    """
+    Compute the mean esm_user_score per time segment for every requested scale.
+
+    Parameters
+    ----------
+    sensor_data_files: mapping
+        Its "sensor_data" entry is the path of the cleaned ESM csv file.
+    time_segment:
+        Passed on unchanged to filter_data_by_segment.
+    provider: mapping
+        Provider configuration; the entries "FEATURES" and "SCALES" are read.
+        Every scale must be a key of QUESTIONNAIRE_IDS.
+    filter_data_by_segment: callable
+        Called as filter_data_by_segment(esm_data, time_segment).
+        Its result must contain the columns local_segment, questionnaire_id
+        and esm_user_score.
+
+    Returns
+    -------
+    esm_features: pd.DataFrame
+        One row per local_segment and one column "<scale>_mean" per requested scale.
+        A segment without answers for a scale gets NaN in that column.
+        If there is no data, an empty dataframe with the columns local_segment
+        and the requested base features is returned.
+    """
+    esm_data = pd.read_csv(sensor_data_files["sensor_data"])
+    requested_features = provider["FEATURES"]
+    # name of the features this function can compute
+    requested_scales = provider["SCALES"]
+    base_features_names = ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support"]
+    #TODO Check valid questionnaire and feature names.
+    # the subset of requested features this function can compute
+    features_to_compute = list(set(requested_features) & set(base_features_names))
+    # NOTE(review): the columns of this empty frame are feature names,
+    # while the computed columns below are named "<scale>_mean" - confirm which is intended.
+    esm_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
+    if esm_data.empty:
+        return esm_features
+
+    esm_data = filter_data_by_segment(esm_data, time_segment)
+    if esm_data.empty:
+        return esm_features
+
+    # Collect one Series per scale first and build the frame in one go.
+    # Assigning the Series one after another to an empty DataFrame fixes the index
+    # to the segments of the first scale and drops all other segments.
+    scale_means = {}
+    for scale in requested_scales:
+        questionnaire_id = QUESTIONNAIRE_IDS[scale]
+        mask = esm_data["questionnaire_id"] == questionnaire_id
+        scale_means[scale + "_mean"] = esm_data.loc[mask].groupby(["local_segment"])["esm_user_score"].mean()
+        #TODO Create the column esm_user_score in esm_clean. Currently, this is only done when reversing.
+
+    # The index of the new frame is the union of the segments of all scales.
+    esm_features = pd.DataFrame(scale_means).rename_axis("local_segment").reset_index()
+    return esm_features
--- a/src/features/phone_esm/straw/preprocess.py
+++ b/src/features/phone_esm/straw/preprocess.py
@ -0,0 +1,25 @@
+from esm_preprocess import *
+from esm_JCQ import reverse_jcq_demand_control_scoring
+
+# Scales (questionnaire names) requested in the Snakemake rule.
+requested_scales = snakemake.params["scales"]
+
+# Read the raw ESM data and expand its JSON column.
+df_esm = pd.read_csv(snakemake.input[0])
+df_esm_preprocessed = preprocess_esm(df_esm)
+
+# Stop early if any requested scale is not a known questionnaire.
+unknown_scales = set(requested_scales) - set(QUESTIONNAIRE_IDS.keys())
+if unknown_scales:
+    print("The requested questionnaire name should be one of the following:")
+    print(QUESTIONNAIRE_IDS.keys())
+    raise ValueError("You requested scales not collected: ", unknown_scales)
+
+# Keep valid answers only and start from the numeric answer as the score.
+df_esm_clean = clean_up_esm(df_esm_preprocessed)
+df_esm_clean["esm_user_score"] = df_esm_clean["esm_user_answer_numeric"]
+
+for scale in requested_scales:
+    mask = df_esm_clean["questionnaire_id"] == QUESTIONNAIRE_IDS[scale]
+    is_jcq_scale = scale.startswith("JCQ")
+    if is_jcq_scale:
+        # Recode the rows of this JCQ scale and write them back in place.
+        jcq_rows = df_esm_clean.loc[mask]
+        df_esm_clean.loc[mask] = reverse_jcq_demand_control_scoring(jcq_rows)
+    #TODO Reverse other questionnaires if needed and/or adapt esm_user_score to original scoring.
+
+df_esm_clean.to_csv(snakemake.output[0], index=False)
--- a/src/features/phone_locations/barnett/daily_features.R
+++ b/src/features/phone_locations/barnett/daily_features.R
@ -25,9 +25,11 @@ barnett_daily_features <- function(snakemake){
  datetime_end_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 23:59:59"
  location <- location %>% 
    mutate(is_daily = str_detect(assigned_segments, paste0(".*#", datetime_start_regex, ",", datetime_end_regex, ".*")))
-  
-  if(nrow(segment_labels) == 0 || nrow(location) == 0 || all(location$is_daily == FALSE) || (max(location$timestamp) - min(location$timestamp) < 86400000)){
-    warning("Barnett's location features cannot be computed for data or time segments that do not span one or more entire days (00:00:00 to 23:59:59). Values below point to the problem:",
+
+  does_not_span = nrow(segment_labels) == 0 || nrow(location) == 0 || all(location$is_daily == FALSE) || (max(location$timestamp) - min(location$timestamp) < 86400000)
+
+  if(is.na(does_not_span) || does_not_span){
+      warning("Barnett's location features cannot be computed for data or time segments that do not span one or more entire days (00:00:00 to 23:59:59). Values below point to the problem:",
            "\nLocation data rows within a daily time segment: ", nrow(filter(location, is_daily)),
            "\nLocation data time span in days: ", round((max(location$timestamp) - min(location$timestamp)) / 86400000, 2)
            )
--- a/src/models/helper.py
+++ b/src/models/helper.py
@ -0,0 +1,18 @@
+import pandas as pd
+
+
+def retain_target_column(df_input: pd.DataFrame, target_variable_name: str):
+    """
+    Drop all phone_esm columns except one and keep that one as the target.
+
+    Parameters
+    ----------
+    df_input: pd.DataFrame
+        A dataframe of features. Columns starting with "phone_esm_straw"
+        are not treated as features. It is not modified.
+    target_variable_name: str
+        A pattern identifying the target among the phone_esm_straw columns.
+        It is matched with Index.str.contains, i.e. as a regular expression
+        found anywhere in the column name, and must match exactly one column.
+
+    Returns
+    -------
+    sensor_features_plus_target: pd.DataFrame
+        The input without phone_esm_straw columns and with the matched column
+        added as the last column, named "target".
+
+    Raises
+    ------
+    ValueError
+        If no phone_esm_straw column matches, or if more than one does.
+    """
+    column_names = df_input.columns
+    # Find all columns coming from phone_esm, since these are not features for our purposes and we will drop them.
+    esm_names = column_names[column_names.str.startswith("phone_esm_straw")]
+    target_names = esm_names[esm_names.str.contains(target_variable_name)]
+    if len(target_names) == 0:
+        raise ValueError(
+            "The requested target (" + str(target_variable_name) + ") cannot be found in the dataset. "
+            "Please check the names of phone_esm_ columns in all_sensor_features_cleaned_rapids.csv"
+        )
+    if len(target_names) > 1:
+        # Several columns cannot be assigned to the single column "target".
+        raise ValueError(
+            "The requested target (" + str(target_variable_name) + ") matches more than one column: "
+            + ", ".join(target_names)
+            + ". Please use a more specific name."
+        )
+    sensor_features_plus_target = df_input.drop(esm_names, axis=1)
+    # We will only keep one column related to phone_esm and that will be our target variable.
+    # Add it back to the very end of the data frame and rename it to target.
+    sensor_features_plus_target["target"] = df_input[target_names[0]]
+    return sensor_features_plus_target
--- a/src/models/merge_features_and_targets_for_population_model.py
+++ b/src/models/merge_features_and_targets_for_population_model.py
@ -0,0 +1,20 @@
+import pandas as pd
+
+from helper import retain_target_column
+
+sensor_features = pd.read_csv(snakemake.input["cleaned_sensor_features"])
+
+# Read the baseline (demographic) features of every participant.
+# The frames are collected in a list and concatenated once,
+# instead of growing a dataframe inside the loop.
+baseline_features_per_participant = []
+for baseline_features_path in snakemake.input["demographic_features"]:
+    # NOTE(review): assumes the participant ID is the fourth component of the path - confirm against the rule.
+    pid = baseline_features_path.split("/")[3]
+    baseline_features = pd.read_csv(baseline_features_path)
+    baseline_features = baseline_features.assign(pid=pid)
+    baseline_features_per_participant.append(baseline_features)
+
+if baseline_features_per_participant:
+    all_baseline_features = pd.concat(baseline_features_per_participant, axis=0)
+else:
+    # Without baseline files, keep a "pid" column so that the merge below still works
+    # and simply returns the sensor features.
+    all_baseline_features = pd.DataFrame(columns=["pid"])
+
+# merge sensor features and baseline features
+features = sensor_features.merge(all_baseline_features, on="pid", how="left")
+
+target_variable_name = snakemake.params["target_variable"]
+model_input = retain_target_column(features, target_variable_name)
+
+model_input.to_csv(snakemake.output[0], index=False)
--- a/src/models/select_targets.py
+++ b/src/models/select_targets.py
@ -0,0 +1,11 @@
+import pandas as pd
+
+from helper import retain_target_column
+
+# Read the cleaned features and the name of the requested target variable.
+cleaned_sensor_features = pd.read_csv(snakemake.input["cleaned_sensor_features"])
+target_variable_name = snakemake.params["target_variable"]
+
+# Keep the sensor features plus a single "target" column
+# and discard the rows for which the target is missing.
+model_input = retain_target_column(cleaned_sensor_features, target_variable_name)
+model_input = model_input.dropna(subset=["target"])
+
+model_input.to_csv(snakemake.output[0], index=False)
--- a/src/visualization/heatmap_sensors_per_minute_per_time_segment.py
+++ b/src/visualization/heatmap_sensors_per_minute_per_time_segment.py
@ -24,12 +24,12 @@ def colors2colorscale(colors):
 def getDataForPlot(phone_data_yield_per_segment):
    # calculate the length (in minute) of per segment instance
    phone_data_yield_per_segment["length"] = phone_data_yield_per_segment["timestamps_segment"].str.split(",").apply(lambda x: int((int(x[1])-int(x[0])) / (1000 * 60)))
-    # calculate the number of sensors logged at least one row of data per minute.
-    phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(["local_segment", "length", "local_date", "local_hour", "local_minute"])[["sensor", "local_date_time"]].max().reset_index()
    # extract local start datetime of the segment from "local_segment" column
    phone_data_yield_per_segment["local_segment_start_datetimes"] = pd.to_datetime(phone_data_yield_per_segment["local_segment"].apply(lambda x: x.split("#")[1].split(",")[0]))
    # calculate the number of minutes after local start datetime of the segment
    phone_data_yield_per_segment["minutes_after_segment_start"] = ((phone_data_yield_per_segment["local_date_time"] - phone_data_yield_per_segment["local_segment_start_datetimes"]) / pd.Timedelta(minutes=1)).astype("int")
+    # calculate the number of sensors logged at least one row of data per minute.
+    phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(["local_segment", "length", "local_segment_start_datetimes", "minutes_after_segment_start"])[["sensor"]].max().reset_index()
    
    # impute missing rows with 0
    columns_for_full_index = phone_data_yield_per_segment[["local_segment_start_datetimes", "length"]].drop_duplicates(keep="first")
Author	SHA1	Message	Date
junos	ed5314aa98	Merge remote-tracking branch 'origin/master'	2022-04-12 17:27:25 +02:00
junos	11c64cfc1a	Include all participants again.	2022-04-12 17:20:19 +02:00
junos	a6a37c7bd9	Drop NaN targets. This mirrors INNER join in merge_features_and_targets_for_individual_model.py: data = pd.concat([sensor_features, targets[["target"]]], axis=1, join="inner")	2022-04-12 17:01:49 +02:00
junos	9f5edf1c2b	Revert "Add a rule for model baselines." The example was for a classification rather than regression problem. This reverts commit `9ab0c8f289`. # Conflicts: # rules/models.smk	2022-04-12 16:59:42 +02:00
junos	4ad261fae5	Rename baseline features AGAIN. Correct other mistakes.	2022-04-12 16:55:01 +02:00
junos	9ab0c8f289	Add a rule for model baselines. Add baselines and helper functions to main models dir.	2022-04-12 14:23:58 +02:00
junos	570d2eb656	Add the file for population model to Snakefile.	2022-04-12 14:11:40 +02:00
junos	f5688f6154	Add a rule to merge sensor and baseline features. And select target as before.	2022-04-08 15:42:04 +02:00
junos	b1f356c3f7	Extract a function to be used elsewhere.	2022-04-08 15:36:32 +02:00
junos	7ff3dcf5fc	Move and rename target variable.	2022-04-06 18:21:09 +02:00
junos	50c0defca7	Select target columns (no parsing necessary).	2022-04-06 18:16:49 +02:00
junos	ac86221662	[WIP] Add a rule to parse targets. Does nothing for now.	2022-04-06 17:47:03 +02:00
junos	baa94c4c4e	Correct additional error in feature file naming. Add the final feature file to the list in Snakefile.	2022-04-06 17:29:17 +02:00
junos	d2fbef5234	Merge branch 'labels' of https://repo.ijs.si/junoslukan/rapids into labels # Conflicts: # src/features/phone_esm/straw/preprocess.py	2022-04-05 19:28:37 +02:00
junos	d326a1b09d	Include the constant directly in main.py.	2022-04-05 19:08:43 +02:00
junos	2e545e81f0	Include feature calculations for different scales.	2022-04-05 19:05:34 +02:00
junos	cbc8ae4e03	Add necessary checks for empty data frames.	2022-04-05 18:58:09 +02:00
junos	f50a13167e	Add feature files back to Snakefile.	2022-04-05 18:37:58 +02:00
junos	e84c35a36a	Remove unnecessary parameters from preprocess_esm. And correct the newly named interim file.	2022-04-05 18:36:09 +02:00
junos	e2ce68f591	Defer creation of feature files to esm_features rule.	2022-04-05 18:30:04 +02:00
junos	751b04f3f4	Pass scale names to Snakemake correctly.	2022-04-05 18:14:37 +02:00
junos	99245afca3	Try a different approach for preprocessing ESMs. It is important that this follows generic RAPIDS pattern. In the subsequent step of calculating features, there is an expected file and folder structure of data/interim. See rules/common.smk/find_features_files()	2022-04-05 18:02:31 +02:00
junos	ed298a9479	Implement the basic feature extraction steps.	2022-04-05 15:46:02 +02:00
junos	798ec973b4	[WIP] Add a rule for ESM features.	2022-03-30 10:43:30 +02:00
junos	3af8de6235	Create feature provider script.	2022-03-30 10:40:53 +02:00
junos	7173ca13e3	Rename a parameter.	2022-03-30 10:40:53 +02:00
junos	9478dc94f2	Add an else. This is to make sure that in case the reversing fails, we do not get any output items. Snakemake will inform us of an error in this event.	2022-03-30 10:40:53 +02:00
junos	b18dba366e	Add an else. This is to make sure that in case the reversing fails, we do not get any output items. Snakemake will inform us of an error in this event.	2022-03-16 18:59:29 +01:00
junos	916bb21a53	Merge branch 'labels' into run_test_participant	2022-03-16 18:56:00 +01:00
junos	c6144f8403	Reverse JCQ items.	2022-03-16 18:55:46 +01:00
junos	fec7cc9550	Merge branch 'labels' into run_test_participant	2022-03-16 18:30:03 +01:00
junos	23f0aaba3a	Get the name of the questionnaire from Snakefile.	2022-03-16 18:28:57 +01:00
junos	8ed7d23348	Merge branch 'labels' into run_test_participant	2022-03-16 17:56:07 +01:00
junos	679f00dc19	Enable selecting any questionnaire as target.	2022-03-16 17:55:44 +01:00
junos	1374eda171	Flatten questionnaire ID dict.	2022-03-16 17:38:09 +01:00
junos	3e9cdde66e	Merge branch 'master' into run_test_participant	2022-03-16 17:27:50 +01:00
junos	155395512c	Merge branch 'labels' into run_test_participant	2022-03-16 17:09:53 +01:00
junos	cb116100dd	Move preprocessing to features.	2022-03-16 17:06:42 +01:00
junos	19b9da0ba3	Separate function definitions from main.	2022-03-16 16:49:28 +01:00
junos	83a8bb6689	Add an option to disable calculation of baseline features.	2022-03-16 15:51:12 +01:00
junos	ef57103bac	Add questionnaire ID key.	2022-03-15 13:41:33 +01:00
junos	5f293211a7	Reformat.	2022-03-15 13:28:51 +01:00
junos	d470eef27e	Add a rule to preprocess and clean ESM.	2022-03-09 18:38:46 +01:00
junos	b09522a8af	Merge branch 'labels' into run_test_participant	2022-03-09 17:58:44 +01:00
junos	d4a4bbbff0	Remove unused columns.	2022-03-09 17:58:36 +01:00
junos	085a6d144b	Add files to compute and create an empty script.	2022-03-09 17:32:02 +01:00
junos	42d62f16d0	Add RAPIDS mandatory columns for ESM.	2022-03-09 17:31:37 +01:00
junos	a159ca3d3a	Merge branch 'labels' into run_test_participant	2022-03-08 15:43:42 +01:00
junos	2bef86b1da	Add a format for ESM and add to config.	2022-03-08 15:43:25 +01:00
junos	d8e9a309f7	Rename features and write baseline_interim.	2022-03-08 15:10:36 +01:00
junos	ba7c3e620b	Merge branch 'master' into run_test_participant	2022-03-01 12:03:14 +01:00
junos	a3a4f04ffe	Setting with : produces NaNs.	2022-03-01 12:02:57 +01:00
junos	aedb8b6785	Write questionnaire data to data/interim.	2022-03-01 12:02:36 +01:00
junos	631581cc8a	Merge branch 'master' into run_test_participant	2022-03-01 11:42:19 +01:00
junos	d3ebfeeabd	Write questionnaire data to data/interim.	2022-03-01 11:42:08 +01:00
junos	70e077f6ab	Merge branch 'master' into run_test_participant	2022-03-01 11:40:17 +01:00
junos	f13a91044d	Write questionnaire data to data/interim.	2022-03-01 11:39:58 +01:00
junos	b5a6317f4b	Calculate JCQ control and demand control ratio. Include norms and corresponding quartile.	2022-02-28 18:51:47 +01:00
junos	2fed962644	Calculate JCQ demand score. Hardcode question IDs to be reversed.	2022-02-28 18:30:41 +01:00
junos	30ac8b1cd5	Start calculating demand control features.	2022-02-23 19:08:10 +01:00
junos	9a74e74d08	Add the baseline features rule to snakefile. Correct age calculation for a single value instead of dataframe.	2022-02-23 18:15:26 +01:00
junos	43e5ac7918	Merge branch 'master' into run_test_participant	2022-02-23 18:06:07 +01:00
junos	07da6be398	Add age, gender, and language as features. Move calculation of age from merge_baseline_data.py to baseline_features.py.	2022-02-23 18:05:23 +01:00
junos	c801f66533	Retain a single participant ID. Do not plot heatmaps as this is bugged.	2022-02-23 11:10:54 +01:00
junos	176367631b	Prepare baseline feature rule.	2022-02-23 11:09:33 +01:00
Meng Li	28e580e597	Update change-log for v1.8.0	2022-02-10 15:05:55 -05:00
Meng Li	463ac0a2aa	Fix bug#169 (#174 )	2022-01-27 11:27:32 -05:00
Sam	10e896ca1d	Add data stream for AWARE Micro server (#173 ) * Add data stream for AWARE Micro server * Fix one documentation typo and one ommission	2022-01-27 10:47:50 -05:00
Sam	e5dbbfce44	Avoid NA problem in barnett location evaluation (#172 ) * Avoid occasional issue where does_not_span evaluates to NA, which breaks the if() * Restored original warning	2022-01-18 10:16:37 -05:00
Sam	8ae26fb845	Fixes issue where 'duration' in the 'ios_calls' dataframe is seen as a character type. (#171 )	2022-01-18 10:15:53 -05:00