Compare commits
No commits in common. "ed5314aa98accac305560f38fc1360ab0407e4bf" and "bf9c764c97f076f4af288f7afa1a32931996b2db" have entirely different histories.
ed5314aa98
...
bf9c764c97
26
Snakefile
26
Snakefile
|
@ -164,16 +164,6 @@ for provider in config["PHONE_CONVERSATION"]["PROVIDERS"].keys():
|
|||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
||||
|
||||
for provider in config["PHONE_ESM"]["PROVIDERS"].keys():
|
||||
if config["PHONE_ESM"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||
files_to_compute.extend(expand("data/raw/{pid}/phone_esm_raw.csv",pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/raw/{pid}/phone_esm_with_datetime.csv",pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/interim/{pid}/phone_esm_clean.csv",pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/interim/{pid}/phone_esm_features/phone_esm_{language}_{provider_key}.csv",pid=config["PIDS"],language=get_script_language(config["PHONE_ESM"]["PROVIDERS"][provider]["SRC_SCRIPT"]),provider_key=provider.lower()))
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/phone_esm.csv", pid=config["PIDS"]))
|
||||
#files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv",pid=config["PIDS"]))
|
||||
#files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
||||
|
||||
# We can delete these if's as soon as we add feature PROVIDERS to any of these sensors
|
||||
if isinstance(config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"], dict):
|
||||
for provider in config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"].keys():
|
||||
|
@ -413,19 +403,9 @@ for provider in config["ALL_CLEANING_OVERALL"]["PROVIDERS"].keys():
|
|||
if config["ALL_CLEANING_OVERALL"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||
files_to_compute.extend(expand("data/processed/features/all_participants/all_sensor_features_cleaned_" + provider.lower() +".csv"))
|
||||
|
||||
# Baseline features
|
||||
if config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["COMPUTE"]:
|
||||
files_to_compute.extend(expand("data/raw/baseline_merged.csv"))
|
||||
files_to_compute.extend(expand("data/raw/{pid}/participant_baseline_raw.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/interim/{pid}/baseline_questionnaires.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/baseline_features.csv", pid=config["PIDS"]))
|
||||
|
||||
# Targets (labels)
|
||||
if config["PARAMS_FOR_ANALYSIS"]["TARGET"]["COMPUTE"]:
|
||||
files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/input.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/processed/models/population_model/input.csv"))
|
||||
|
||||
#files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/output_{cv_method}/baselines.csv", pid=config["PIDS"], cv_method=config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"]))
|
||||
# Demographic features
|
||||
files_to_compute.extend(expand("data/raw/baseline_merged.csv"))
|
||||
files_to_compute.extend(expand("data/raw/{pid}/participant_baseline_raw.csv", pid=config["PIDS"]))
|
||||
|
||||
rule all:
|
||||
input:
|
||||
|
|
19
config.yaml
19
config.yaml
|
@ -234,15 +234,6 @@ PHONE_DATA_YIELD:
|
|||
MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1, minimum percentage of valid minutes in an hour to be considered valid.
|
||||
SRC_SCRIPT: src/features/phone_data_yield/rapids/main.R
|
||||
|
||||
PHONE_ESM:
|
||||
CONTAINER: esm
|
||||
PROVIDERS:
|
||||
STRAW:
|
||||
COMPUTE: True
|
||||
SCALES: ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support"]
|
||||
FEATURES: [mean]
|
||||
SRC_SCRIPT: src/features/phone_esm/straw/main.py
|
||||
|
||||
# See https://www.rapids.science/latest/features/phone-keyboard/
|
||||
PHONE_KEYBOARD:
|
||||
CONTAINER: keyboard
|
||||
|
@ -570,7 +561,7 @@ HISTOGRAM_PHONE_DATA_YIELD:
|
|||
|
||||
# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#2-heatmaps-of-overall-data-yield
|
||||
HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT:
|
||||
PLOT: False
|
||||
PLOT: True
|
||||
TIME: RELATIVE_TIME # ABSOLUTE_TIME or RELATIVE_TIME
|
||||
|
||||
# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#3-heatmap-of-recorded-phone-sensors
|
||||
|
@ -638,16 +629,10 @@ ALL_CLEANING_OVERALL:
|
|||
|
||||
PARAMS_FOR_ANALYSIS:
|
||||
BASELINE:
|
||||
COMPUTE: True
|
||||
FOLDER: data/external/baseline
|
||||
CONTAINER: [results-survey637813_final.csv, # Slovenia
|
||||
results-survey358134_final.csv, # Belgium 1
|
||||
results-survey413767_final.csv # Belgium 2
|
||||
]
|
||||
QUESTION_LIST: survey637813+question_text.csv
|
||||
FEATURES: [age, gender, startlanguage, limesurvey_demand, limesurvey_control, limesurvey_demand_control_ratio, limesurvey_demand_control_ratio_quartile]
|
||||
FEATURES: [age, gender]
|
||||
CATEGORICAL_FEATURES: [gender]
|
||||
|
||||
TARGET:
|
||||
COMPUTE: True
|
||||
LABEL: PANAS_negative_affect_mean
|
||||
|
|
|
@ -1,9 +1,4 @@
|
|||
# Change Log
|
||||
## v1.8.0
|
||||
- Add data stream for AWARE Micro server
|
||||
- Fix the NA bug in PHONE_LOCATIONS BARNETT provider
|
||||
- Fix the bug of data type for call_duration field
|
||||
- Fix the index bug of heatmap_sensors_per_minute_per_time_segment
|
||||
## v1.7.1
|
||||
- Update docs for Git Flow section
|
||||
- Update RAPIDS paper information
|
||||
|
|
|
@ -1,15 +0,0 @@
|
|||
# `aware_micro_mysql`
|
||||
|
||||
This [data stream](../../datastreams/data-streams-introduction) handles iOS and Android sensor data collected with the [AWARE Framework's](https://awareframework.com/) [AWARE Micro](https://github.com/denzilferreira/aware-micro) server and stored in a MySQL database.
|
||||
|
||||
## Container
|
||||
A MySQL database with a table per sensor, each containing the data for all participants. Sensor data is stored in a JSON field within each table called `data`
|
||||
|
||||
The script to connect and download data from this container is at:
|
||||
```bash
|
||||
src/data/streams/aware_micro_mysql/container.R
|
||||
```
|
||||
|
||||
## Format
|
||||
|
||||
--8<---- "docs/snippets/aware_format.md"
|
|
@ -16,7 +16,6 @@ For reference, these are the data streams we currently support:
|
|||
| Data Stream | Device | Format | Container | Docs
|
||||
|--|--|--|--|--|
|
||||
| `aware_mysql`| Phone | AWARE app | MySQL | [link](../aware-mysql)
|
||||
| `aware_micro_mysql`| Phone | AWARE Micro server | MySQL | [link](../aware-micro-mysql)
|
||||
| `aware_csv`| Phone | AWARE app | CSV files | [link](../aware-csv)
|
||||
| `aware_influxdb` (beta)| Phone | AWARE app | InfluxDB | [link](../aware-influxdb)
|
||||
| `fitbitjson_mysql`| Fitbit | JSON (per [Fitbit's API](https://dev.fitbit.com/build/reference/web-api/)) | MySQL | [link](../fitbitjson-mysql)
|
||||
|
|
|
@ -85,7 +85,6 @@ nav:
|
|||
- Introduction: datastreams/data-streams-introduction.md
|
||||
- Phone:
|
||||
- aware_mysql: datastreams/aware-mysql.md
|
||||
- aware_micro_mysql: datastreams/aware-micro-mysql.md
|
||||
- aware_csv: datastreams/aware-csv.md
|
||||
- aware_influxdb (beta): datastreams/aware-influxdb.md
|
||||
- Mandatory Phone Format: datastreams/mandatory-phone-format.md
|
||||
|
|
|
@ -324,27 +324,6 @@ rule conversation_r_features:
|
|||
script:
|
||||
"../src/features/entry.R"
|
||||
|
||||
rule preprocess_esm:
|
||||
input: "data/raw/{pid}/phone_esm_with_datetime.csv"
|
||||
params:
|
||||
scales=lambda wildcards: config["PHONE_ESM"]["PROVIDERS"]["STRAW"]["SCALES"]
|
||||
output: "data/interim/{pid}/phone_esm_clean.csv"
|
||||
script:
|
||||
"../src/features/phone_esm/straw/preprocess.py"
|
||||
|
||||
rule esm_features:
|
||||
input:
|
||||
sensor_data = "data/interim/{pid}/phone_esm_clean.csv",
|
||||
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
|
||||
params:
|
||||
provider = lambda wildcards: config["PHONE_ESM"]["PROVIDERS"][wildcards.provider_key.upper()],
|
||||
provider_key = "{provider_key}",
|
||||
sensor_key = "phone_esm",
|
||||
scales=lambda wildcards: config["PHONE_ESM"]["PROVIDERS"][wildcards.provider_key.upper()]["SCALES"]
|
||||
output: "data/interim/{pid}/phone_esm_features/phone_esm_python_{provider_key}.csv"
|
||||
script:
|
||||
"../src/features/entry.py"
|
||||
|
||||
rule phone_keyboard_python_features:
|
||||
input:
|
||||
sensor_data = "data/raw/{pid}/phone_keyboard_with_datetime.csv",
|
||||
|
|
|
@ -14,38 +14,3 @@ rule download_baseline_data:
|
|||
"data/raw/{pid}/participant_baseline_raw.csv"
|
||||
script:
|
||||
"../src/data/download_baseline_data.py"
|
||||
|
||||
rule baseline_features:
|
||||
input:
|
||||
"data/raw/{pid}/participant_baseline_raw.csv"
|
||||
params:
|
||||
pid="{pid}",
|
||||
features=config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["FEATURES"],
|
||||
question_filename=config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["FOLDER"] + "/" + config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["QUESTION_LIST"]
|
||||
output:
|
||||
interim="data/interim/{pid}/baseline_questionnaires.csv",
|
||||
features="data/processed/features/{pid}/baseline_features.csv"
|
||||
script:
|
||||
"../src/data/baseline_features.py"
|
||||
|
||||
rule select_target:
|
||||
input:
|
||||
cleaned_sensor_features = "data/processed/features/{pid}/all_sensor_features_cleaned_rapids.csv"
|
||||
params:
|
||||
target_variable = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["LABEL"]
|
||||
output:
|
||||
"data/processed/models/individual_model/{pid}/input.csv"
|
||||
script:
|
||||
"../src/models/select_targets.py"
|
||||
|
||||
rule merge_features_and_targets_for_population_model:
|
||||
input:
|
||||
cleaned_sensor_features = "data/processed/features/all_participants/all_sensor_features_cleaned_rapids.csv",
|
||||
demographic_features = expand("data/processed/features/{pid}/baseline_features.csv", pid=config["PIDS"]),
|
||||
params:
|
||||
target_variable=config["PARAMS_FOR_ANALYSIS"]["TARGET"]["LABEL"]
|
||||
output:
|
||||
"data/processed/models/population_model/input.csv"
|
||||
script:
|
||||
"../src/models/merge_features_and_targets_for_population_model.py"
|
||||
|
||||
|
|
|
@ -177,6 +177,7 @@ rule resample_episodes_with_datetime:
|
|||
script:
|
||||
"../src/data/datetime/readable_datetime.R"
|
||||
|
||||
|
||||
rule phone_application_categories:
|
||||
input:
|
||||
"data/raw/{pid}/phone_applications_{type}_with_datetime.csv"
|
||||
|
|
|
@ -1,179 +0,0 @@
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
# Snakemake script: derive baseline (questionnaire) features for one participant.
#
# Reads the participant's raw baseline CSV (snakemake.input[0]) and writes
#   * snakemake.output["interim"]  - one row per JCQ question with the original
#     and the (possibly reversed) score,
#   * snakemake.output["features"] - a single row holding the requested features.
# Which features are computed is controlled by snakemake.params["features"].

pid = snakemake.params["pid"]
requested_features = snakemake.params["features"]
baseline_interim = pd.DataFrame(columns=["qid", "question", "score_original", "score"])
baseline_features = pd.DataFrame(columns=requested_features)
question_filename = snakemake.params["question_filename"]

# Column-name prefixes of the two JCQ scales in the LimeSurvey export,
# e.g. "JobEisen[3]" / "JobControle[2]".
JCQ_DEMAND = "JobEisen"
JCQ_CONTROL = "JobControle"

# Question IDs (per scale) whose answers are reverse-scored.
# Only the keys are used below; the values are the question text fragments.
dict_JCQ_demand_control_reverse = {
    JCQ_DEMAND: {
        3: " [Od mene se ne zahteva,",
        4: " [Imam dovolj časa, da končam",
        5: " [Pri svojem delu se ne srečujem s konfliktnimi",
    },
    JCQ_CONTROL: {
        2: " |Moje delo vključuje veliko ponavljajočega",
        6: " [Pri svojem delu imam zelo malo svobode",
    },
}

# Range of a single answer; used to mirror reverse-scored items.
LIMESURVEY_JCQ_MIN = 1
LIMESURVEY_JCQ_MAX = 4

# Bounds used as the outer edges of the quartile bins below.
DEMAND_CONTROL_RATIO_MIN = 5 / (9 * 4)
DEMAND_CONTROL_RATIO_MAX = (4 * 5) / 9

# Quartile edges of the demand/control ratio, keyed by the gender value found
# in the participant file. Entry k-1 and entry k delimit quartile k.
JCQ_NORMS = {
    "F": {
        0: DEMAND_CONTROL_RATIO_MIN,
        1: 0.45,
        2: 0.52,
        3: 0.62,
        4: DEMAND_CONTROL_RATIO_MAX,
    },
    "M": {
        0: DEMAND_CONTROL_RATIO_MIN,
        1: 0.41,
        2: 0.48,
        3: 0.56,
        4: DEMAND_CONTROL_RATIO_MAX,
    },
}


def _score_jcq_scale(participant_info_t, scale_prefix):
    """Return the per-question scores of one JCQ scale.

    participant_info_t is the transposed participant table (one row per
    original column, participant answers in column 0). Rows whose name starts
    with scale_prefix are kept, except the "...Time" rows that hold the time
    spent filling in the questionnaire. The result has the columns
    "question", "score_original", "qid" and "score", where "score" is
    mirrored within [LIMESURVEY_JCQ_MIN, LIMESURVEY_JCQ_MAX] for the
    reverse-scored question IDs of that scale.
    """
    row_names = participant_info_t.index
    scale_rows = row_names.str.startswith(scale_prefix) & ~row_names.str.endswith(
        "Time"
    )
    scores = (
        participant_info_t[scale_rows]
        .reset_index()
        .rename(columns={"index": "question", 0: "score_original"})
    )
    # Extract question IDs from names such as JobEisen[3]
    scores["qid"] = scores["question"].str.extract(r"\[(\d+)\]").astype(int)
    scores["score"] = scores["score_original"]
    rows_reverse = scores["qid"].isin(
        dict_JCQ_demand_control_reverse[scale_prefix].keys()
    )
    # Reverse the score, so that the maximum value becomes the minimum etc.
    scores.loc[rows_reverse, "score"] = (
        LIMESURVEY_JCQ_MAX
        + LIMESURVEY_JCQ_MIN
        - scores.loc[rows_reverse, "score_original"]
    )
    return scores


def _demand_control_quartile(ratio, gender):
    """Return the quartile (1-4) of ratio within JCQ_NORMS[gender], or NaN.

    Bins are half-open [lower, upper), except the last one, which also
    includes its upper bound so that the maximum ratio lands in quartile 4.
    Raises KeyError when gender has no entry in JCQ_NORMS.
    """
    norms = JCQ_NORMS[gender]
    for quartile in range(1, 5):
        if norms[quartile - 1] <= ratio < norms[quartile]:
            return quartile
    if ratio == norms[4]:
        return 4
    return np.nan


participant_info = pd.read_csv(snakemake.input[0], parse_dates=["date_of_birth"])

if not participant_info.empty:
    if "age" in requested_features:
        now = pd.Timestamp("now")
        baseline_features.loc[0, "age"] = (
            now - participant_info.loc[0, "date_of_birth"]
        ).days / 365.25245
    if "gender" in requested_features:
        baseline_features.loc[0, "gender"] = participant_info.loc[0, "gender"]
    if "startlanguage" in requested_features:
        baseline_features.loc[0, "startlanguage"] = participant_info.loc[
            0, "startlanguage"
        ]

    want_demand = "limesurvey_demand" in requested_features
    want_control = "limesurvey_control" in requested_features
    want_ratio = "limesurvey_demand_control_ratio" in requested_features

    if want_demand or want_control or want_ratio:
        participant_info_t = participant_info.T

        # The ratio needs both scales, so each scale is also scored for it.
        if want_demand or want_ratio:
            limesurvey_demand = _score_jcq_scale(participant_info_t, JCQ_DEMAND)
            baseline_interim = pd.concat(
                [baseline_interim, limesurvey_demand], axis=0, ignore_index=True
            )
            # Fixed: this used to test for "demand", which is never a
            # requested feature name, so the sum was never stored.
            if want_demand:
                baseline_features.loc[0, "limesurvey_demand"] = limesurvey_demand[
                    "score"
                ].sum()

        if want_control or want_ratio:
            limesurvey_control = _score_jcq_scale(participant_info_t, JCQ_CONTROL)
            baseline_interim = pd.concat(
                [baseline_interim, limesurvey_control], axis=0, ignore_index=True
            )
            if want_control:
                baseline_features.loc[0, "limesurvey_control"] = limesurvey_control[
                    "score"
                ].sum()

        if want_ratio:
            limesurvey_demand_control_ratio = (
                limesurvey_demand["score"].sum() / limesurvey_control["score"].sum()
            )
            limesurvey_quartile = _demand_control_quartile(
                limesurvey_demand_control_ratio, participant_info.loc[0, "gender"]
            )
            baseline_features.loc[
                0, "limesurvey_demand_control_ratio"
            ] = limesurvey_demand_control_ratio
            baseline_features.loc[
                0, "limesurvey_demand_control_ratio_quartile"
            ] = limesurvey_quartile

# Always write the interim file, even when it only contains the header:
# it is a declared output of the rule, so it has to exist after the job ran.
baseline_interim.to_csv(snakemake.output["interim"], index=False, encoding="utf-8")

baseline_features.to_csv(snakemake.output["features"], index=False, encoding="utf-8")
|
|
@ -24,6 +24,10 @@ baseline = (
|
|||
)
|
||||
|
||||
baseline.rename(columns=VARIABLES_TO_TRANSLATE, copy=False, inplace=True)
|
||||
now = pd.Timestamp("now")
|
||||
baseline = baseline.assign(
|
||||
age=lambda x: (now - x.date_of_birth).dt.days / 365.25245,
|
||||
)
|
||||
|
||||
baseline.to_csv(snakemake.output[0],
|
||||
index=False,
|
||||
|
|
|
@ -1,85 +0,0 @@
|
|||
# if you need a new package, you should add it with renv::install(package) so your renv venv is updated
|
||||
library(RMariaDB)
|
||||
library(yaml)
|
||||
|
||||
#' @description
#' Auxiliary function that reads the connection credentials of a specific group from ./credentials.yaml
#' and opens a MariaDB connection with them.
#' You can reuse most of this function if you are connecting to a DB or Web API.
#' It's OK to delete this function if you don't need credentials, e.g., you are pulling data from a CSV file.
#' @param group the yaml key containing the credentials to connect to a database
#' @return dbEngine a database engine (connection) ready to perform queries
get_db_engine <- function(group){
  # The working dir is always the RAPIDS root folder, so the credentials file is always ./credentials.yaml
  credentials <- read_yaml("./credentials.yaml")
  available_groups <- names(credentials)

  # Fail early, listing the groups that do exist, when the requested one is missing
  if(!(group %in% available_groups)) {
    stop(paste("The credentials group",group, "does not exist in ./credentials.yaml. The only groups that exist in that file are:", paste(available_groups, collapse = ","), ". Did you forget to set the group in [PHONE_DATA_STREAMS][aware_mysql][DATABASE_GROUP] in config.yaml?"))
  }

  group_credentials <- credentials[[group]]
  dbEngine <- dbConnect(
    MariaDB(),
    db = group_credentials[["database"]],
    username = group_credentials[["user"]],
    password = group_credentials[["password"]],
    host = group_credentials[["host"]],
    port = group_credentials[["port"]]
  )
  return(dbEngine)
}
|
||||
|
||||
# This file gets executed for each PHONE_SENSOR of each participant
|
||||
# If you are connecting to a database the env file containing its credentials is available at "./.env"
|
||||
# If you are reading a CSV file instead of a DB table, the @param sensor_container will contain the file path as set in config.yaml
|
||||
# You are not bound to databases or files, you can query a web API or whatever data source you need.
|
||||
|
||||
#' @description
#' RAPIDS allows users to use the keyword "infer" (previously "multiple") to automatically infer the mobile Operating System a device was running.
#' If you have a way to infer the OS of a device ID, implement this function. For example, for AWARE data we use the "aware_device" table.
#'
#' If you don't have a way to infer the OS, call stop("Error Message") so other users know they can't use "infer" or the inference failed,
#' and they have to assign the OS manually in the participant file
#'
#' This implementation looks the device up in the aware_device table and maps the brand "iPhone" to "ios" and every other brand to "android".
#' It stops with an error when the device id is not present in that table.
#'
#' @param stream_parameters The PHONE_STREAM_PARAMETERS key in config.yaml. If you need specific parameters add them there.
#' @param device A device ID string
#' @return The OS the device ran, "android" or "ios"

infer_device_os <- function(stream_parameters, device){
  dbEngine <- get_db_engine(stream_parameters$DATABASE_GROUP)
  # Close the connection however we leave the function, including when the query itself fails
  on.exit(dbDisconnect(dbEngine), add = TRUE)

  query <- paste0("SELECT device_id,brand FROM aware_device WHERE device_id = '", device, "'")
  message(paste0("Executing the following query to infer phone OS: ", query))
  os <- dbGetQuery(dbEngine, query)

  if(nrow(os) == 0)
    stop(paste("We cannot infer the OS of the following device id because it does not exist in the aware_device table:", device))

  return(os %>% mutate(os = ifelse(brand == "iPhone", "ios", "android")) %>% pull(os))
}
|
||||
|
||||
#' @description
#' Gets the sensor data for a specific device id from a database table, file or whatever source you want to query
#'
#' This implementation selects every requested column out of the JSON field `data` of the sensor table
#' and filters the rows by the table column named in columns$DEVICE_ID.
#' A warning is raised when the query returns no rows.
#'
#' @param stream_parameters The PHONE_STREAM_PARAMETERS key in config.yaml. If you need specific parameters add them there.
#' @param device A device ID string
#' @param sensor The sensor being pulled (not used by this implementation)
#' @param sensor_container database table or file containing the sensor data for all participants. This is the PHONE_SENSOR[CONTAINER] key in config.yaml
#' @param columns the columns needed from this sensor (we recommend to only return these columns instead of every column in sensor_container)
#' @return A dataframe with the sensor data for device

pull_data <- function(stream_parameters, device, sensor, sensor_container, columns){
  dbEngine <- get_db_engine(stream_parameters$DATABASE_GROUP)
  # Close the connection however we leave the function, including when the query itself fails
  on.exit(dbDisconnect(dbEngine), add = TRUE)

  # One "data->>'$.<column>' <column>" item per requested column: extract the value from the JSON field and alias it
  select_items <- vapply(
    columns,
    function(column) paste0("data->>'$.", column, "' ", column),
    character(1),
    USE.NAMES = FALSE
  )

  query <- paste0("SELECT ", paste(select_items, collapse = ",")," FROM ", sensor_container, " WHERE ", columns$DEVICE_ID ," = '", device,"'")

  # Letting the user know what we are doing
  message(paste0("Executing the following query to download data: ", query))
  sensor_data <- dbGetQuery(dbEngine, query)

  if(nrow(sensor_data) == 0)
    # paste0 so that no spaces end up inside the quotes around the device id
    warning(paste0("The device '", device, "' did not have data in ", sensor_container))

  return(sensor_data)
}
|
||||
|
|
@ -1,337 +0,0 @@
|
|||
PHONE_ACCELEROMETER:
|
||||
ANDROID:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: timestamp
|
||||
DEVICE_ID: device_id
|
||||
DOUBLE_VALUES_0: double_values_0
|
||||
DOUBLE_VALUES_1: double_values_1
|
||||
DOUBLE_VALUES_2: double_values_2
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||
IOS:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: timestamp
|
||||
DEVICE_ID: device_id
|
||||
DOUBLE_VALUES_0: double_values_0
|
||||
DOUBLE_VALUES_1: double_values_1
|
||||
DOUBLE_VALUES_2: double_values_2
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||
|
||||
PHONE_ACTIVITY_RECOGNITION:
|
||||
ANDROID:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: timestamp
|
||||
DEVICE_ID: device_id
|
||||
ACTIVITY_NAME: activity_name
|
||||
ACTIVITY_TYPE: activity_type
|
||||
CONFIDENCE: confidence
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||
IOS:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: timestamp
|
||||
DEVICE_ID: device_id
|
||||
ACTIVITY_NAME: FLAG_TO_MUTATE
|
||||
ACTIVITY_TYPE: FLAG_TO_MUTATE
|
||||
CONFIDENCE: FLAG_TO_MUTATE
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
ACTIVITIES: activities
|
||||
CONFIDENCE: confidence
|
||||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||
- "src/data/streams/mutations/phone/aware/activity_recogniton_ios_unification.R"
|
||||
|
||||
PHONE_APPLICATIONS_CRASHES:
|
||||
ANDROID:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: timestamp
|
||||
DEVICE_ID: device_id
|
||||
PACKAGE_NAME: package_name
|
||||
APPLICATION_NAME: application_name
|
||||
APPLICATION_VERSION: application_version
|
||||
ERROR_SHORT: error_short
|
||||
ERROR_LONG: error_long
|
||||
ERROR_CONDITION: error_condition
|
||||
IS_SYSTEM_APP: is_system_app
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||
|
||||
PHONE_APPLICATIONS_FOREGROUND:
|
||||
ANDROID:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: timestamp
|
||||
DEVICE_ID: device_id
|
||||
PACKAGE_NAME: package_name
|
||||
APPLICATION_NAME: application_name
|
||||
IS_SYSTEM_APP: is_system_app
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||
|
||||
PHONE_APPLICATIONS_NOTIFICATIONS:
|
||||
ANDROID:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: timestamp
|
||||
DEVICE_ID: device_id
|
||||
PACKAGE_NAME: package_name
|
||||
APPLICATION_NAME: application_name
|
||||
TEXT: text
|
||||
SOUND: sound
|
||||
VIBRATE: vibrate
|
||||
DEFAULTS: defaults
|
||||
FLAGS: flags
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||
|
||||
PHONE_BATTERY:
|
||||
ANDROID:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: timestamp
|
||||
DEVICE_ID: device_id
|
||||
BATTERY_STATUS: battery_status
|
||||
BATTERY_LEVEL: battery_level
|
||||
BATTERY_SCALE: battery_scale
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||
IOS:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: timestamp
|
||||
DEVICE_ID: device_id
|
||||
BATTERY_STATUS: FLAG_TO_MUTATE
|
||||
BATTERY_LEVEL: battery_level
|
||||
BATTERY_SCALE: battery_scale
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
BATTERY_STATUS: battery_status
|
||||
SCRIPTS:
|
||||
- "src/data/streams/mutations/phone/aware/battery_ios_unification.R"
|
||||
|
||||
PHONE_BLUETOOTH:
|
||||
ANDROID:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: timestamp
|
||||
DEVICE_ID: device_id
|
||||
BT_ADDRESS: bt_address
|
||||
BT_NAME: bt_name
|
||||
BT_RSSI: bt_rssi
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||
IOS:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: timestamp
|
||||
DEVICE_ID: device_id
|
||||
BT_ADDRESS: bt_address
|
||||
BT_NAME: bt_name
|
||||
BT_RSSI: bt_rssi
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||
|
||||
PHONE_CALLS:
|
||||
ANDROID:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: timestamp
|
||||
DEVICE_ID: device_id
|
||||
CALL_TYPE: call_type
|
||||
CALL_DURATION: call_duration
|
||||
TRACE: trace
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||
IOS:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: timestamp
|
||||
DEVICE_ID: device_id
|
||||
CALL_TYPE: FLAG_TO_MUTATE
|
||||
CALL_DURATION: call_duration
|
||||
TRACE: trace
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
CALL_TYPE: call_type
|
||||
SCRIPTS:
|
||||
- "src/data/streams/mutations/phone/aware/calls_ios_unification.R"
|
||||
|
||||
PHONE_CONVERSATION:
|
||||
ANDROID:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: timestamp
|
||||
DEVICE_ID: device_id
|
||||
DOUBLE_ENERGY: double_energy
|
||||
INFERENCE: inference
|
||||
DOUBLE_CONVO_START: double_convo_start
|
||||
DOUBLE_CONVO_END: double_convo_end
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||
IOS:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: timestamp
|
||||
DEVICE_ID: device_id
|
||||
DOUBLE_ENERGY: double_energy
|
||||
INFERENCE: inference
|
||||
DOUBLE_CONVO_START: double_convo_start
|
||||
DOUBLE_CONVO_END: double_convo_end
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||
- "src/data/streams/mutations/phone/aware/conversation_ios_timestamp.R"
|
||||
|
||||
PHONE_KEYBOARD:
|
||||
ANDROID:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: timestamp
|
||||
DEVICE_ID: device_id
|
||||
PACKAGE_NAME: package_name
|
||||
BEFORE_TEXT: before_text
|
||||
CURRENT_TEXT: current_text
|
||||
IS_PASSWORD: is_password
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||
|
||||
PHONE_LIGHT:
|
||||
ANDROID:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: timestamp
|
||||
DEVICE_ID: device_id
|
||||
DOUBLE_LIGHT_LUX: double_light_lux
|
||||
ACCURACY: accuracy
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||
|
||||
PHONE_LOCATIONS:
|
||||
ANDROID:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: timestamp
|
||||
DEVICE_ID: device_id
|
||||
DOUBLE_LATITUDE: double_latitude
|
||||
DOUBLE_LONGITUDE: double_longitude
|
||||
DOUBLE_BEARING: double_bearing
|
||||
DOUBLE_SPEED: double_speed
|
||||
DOUBLE_ALTITUDE: double_altitude
|
||||
PROVIDER: provider
|
||||
ACCURACY: accuracy
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||
IOS:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: timestamp
|
||||
DEVICE_ID: device_id
|
||||
DOUBLE_LATITUDE: double_latitude
|
||||
DOUBLE_LONGITUDE: double_longitude
|
||||
DOUBLE_BEARING: double_bearing
|
||||
DOUBLE_SPEED: double_speed
|
||||
DOUBLE_ALTITUDE: double_altitude
|
||||
PROVIDER: provider
|
||||
ACCURACY: accuracy
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||
|
||||
PHONE_LOG:
|
||||
ANDROID:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: timestamp
|
||||
DEVICE_ID: device_id
|
||||
LOG_MESSAGE: log_message
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||
IOS:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: timestamp
|
||||
DEVICE_ID: device_id
|
||||
LOG_MESSAGE: log_message
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||
|
||||
PHONE_MESSAGES:
|
||||
ANDROID:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: timestamp
|
||||
DEVICE_ID: device_id
|
||||
MESSAGE_TYPE: message_type
|
||||
TRACE: trace
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||
|
||||
PHONE_SCREEN:
|
||||
ANDROID:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: timestamp
|
||||
DEVICE_ID: device_id
|
||||
SCREEN_STATUS: screen_status
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||
IOS:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: timestamp
|
||||
DEVICE_ID: device_id
|
||||
SCREEN_STATUS: FLAG_TO_MUTATE
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
SCREEN_STATUS: screen_status
|
||||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||
- "src/data/streams/mutations/phone/aware/screen_ios_unification.R"
|
||||
|
||||
PHONE_WIFI_CONNECTED:
|
||||
ANDROID:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: timestamp
|
||||
DEVICE_ID: device_id
|
||||
MAC_ADDRESS: mac_address
|
||||
SSID: ssid
|
||||
BSSID: bssid
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||
IOS:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: timestamp
|
||||
DEVICE_ID: device_id
|
||||
MAC_ADDRESS: mac_address
|
||||
SSID: ssid
|
||||
BSSID: bssid
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||
|
||||
PHONE_WIFI_VISIBLE:
|
||||
ANDROID:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: timestamp
|
||||
DEVICE_ID: device_id
|
||||
SSID: ssid
|
||||
BSSID: bssid
|
||||
SECURITY: security
|
||||
FREQUENCY: frequency
|
||||
RSSI: rssi
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||
IOS:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: timestamp
|
||||
DEVICE_ID: device_id
|
||||
SSID: ssid
|
||||
BSSID: bssid
|
||||
SECURITY: security
|
||||
FREQUENCY: frequency
|
||||
RSSI: rssi
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||
|
|
@ -183,21 +183,6 @@ PHONE_CONVERSATION:
|
|||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||
- "src/data/streams/mutations/phone/aware/conversation_ios_timestamp.R"
|
||||
|
||||
PHONE_ESM:
|
||||
ANDROID:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
TIMESTAMP: double_esm_user_answer_timestamp
|
||||
DEVICE_ID: device_id
|
||||
ESM_STATUS: esm_status
|
||||
ESM_USER_ANSWER: esm_user_answer
|
||||
ESM_JSON: esm_json
|
||||
ESM_TRIGGER: esm_trigger
|
||||
ESM_SESSION: esm_session
|
||||
ESM_NOTIFICATION_ID: esm_notification_id
|
||||
MUTATION:
|
||||
COLUMN_MAPPINGS:
|
||||
SCRIPTS:
|
||||
|
||||
PHONE_KEYBOARD:
|
||||
ANDROID:
|
||||
RAPIDS_COLUMN_MAPPINGS:
|
||||
|
|
|
@ -39,7 +39,7 @@ unify_ios_calls <- function(ios_calls){
|
|||
assigned_segments = first(assigned_segments))
|
||||
}
|
||||
else {
|
||||
ios_calls <- ios_calls %>% summarise(call_type_sequence = paste(call_type, collapse = ","), call_duration = sum(as.numeric(call_duration)), timestamp = first(timestamp), device_id = first(device_id))
|
||||
ios_calls <- ios_calls %>% summarise(call_type_sequence = paste(call_type, collapse = ","), call_duration = sum(call_duration), timestamp = first(timestamp), device_id = first(device_id))
|
||||
}
|
||||
ios_calls <- ios_calls %>% mutate(call_type = case_when(
|
||||
call_type_sequence == "1,2,4" | call_type_sequence == "2,1,4" ~ 1, # incoming
|
||||
|
|
|
@ -67,16 +67,6 @@ PHONE_CONVERSATION:
|
|||
- DOUBLE_CONVO_START
|
||||
- DOUBLE_CONVO_END
|
||||
|
||||
PHONE_ESM:
|
||||
- TIMESTAMP
|
||||
- DEVICE_ID
|
||||
- ESM_STATUS
|
||||
- ESM_USER_ANSWER
|
||||
- ESM_JSON
|
||||
- ESM_TRIGGER
|
||||
- ESM_SESSION
|
||||
- ESM_NOTIFICATION_ID
|
||||
|
||||
PHONE_KEYBOARD:
|
||||
- TIMESTAMP
|
||||
- DEVICE_ID
|
||||
|
|
|
@ -1,108 +0,0 @@
|
|||
import pandas as pd
|
||||
|
||||
# Bounds of the original Job Content Questionnaire (JCQ) answer scale.
JCQ_ORIGINAL_MAX = 4
JCQ_ORIGINAL_MIN = 1

# Question IDs of the negatively phrased JCQ items whose scores are reversed,
# mapped to the accepted beginnings of their wording.
# The wording is used to verify that a question ID really refers to the expected item.
dict_JCQ_demand_control_reverse = {
    75: (
        "I was NOT asked",
        "Men legde mij geen overdreven",
        "Men legde mij GEEN overdreven",  # Capitalized in some versions
        "Od mene se NI zahtevalo",
    ),
    76: (
        "I had enough time to do my work",
        "Ik had voldoende tijd om mijn werk",
        "Imela sem dovolj časa, da končam",
        "Imel sem dovolj časa, da končam",
    ),
    77: (
        "I was free of conflicting demands",
        "Er werden mij op het werk geen tegenstrijdige",
        "Er werden mij op het werk GEEN tegenstrijdige",  # Capitalized in some versions
        "Pri svojem delu se NISEM srečeval",
    ),
    79: (
        "My job involved a lot of repetitive work",
        "Mijn taak omvatte veel repetitief werk",
        "Moje delo je vključevalo veliko ponavljajočega",
    ),
    85: (
        "On my job, I had very little freedom",
        "In mijn taak had ik zeer weinig vrijheid",
        "Pri svojem delu sem imel zelo malo svobode",
        "Pri svojem delu sem imela zelo malo svobode",
    ),
}


def reverse_jcq_demand_control_scoring(
    df_esm_jcq_demand_control: pd.DataFrame,
) -> pd.DataFrame:
    """
    Recode Job Content Questionnaire answers to the original 1-4 scoring.

    Every answer is first incremented by 1. Answers to the negatively phrased
    questions listed in dict_JCQ_demand_control_reverse are then reversed
    (1 becomes 4 etc.). Before scoring, the wording found in the data for each
    of those question IDs is checked against the expected phrasings to protect
    against differing question numbering.

    Parameters
    ----------
    df_esm_jcq_demand_control: pd.DataFrame
        A cleaned up dataframe with the columns question_id, esm_instructions
        and esm_user_answer_numeric.

    Returns
    -------
    df_esm_jcq_demand_control: pd.DataFrame
        A copy of the dataframe with a column esm_user_score containing the
        recoded and reversed answers. If esm_user_answer_numeric is missing,
        a hint is printed and the input is returned unchanged.

    Raises
    ------
    KeyError
        If a question ID that should be reversed occurs in the data with an
        unexpected wording.
    """
    question_ids = df_esm_jcq_demand_control["question_id"]

    for q_id, expected_beginnings in dict_JCQ_demand_control_reverse.items():
        # All distinct wordings recorded in the data for this question ID.
        wordings = (
            df_esm_jcq_demand_control.loc[question_ids == q_id, "esm_instructions"]
            .dropna()
            .drop_duplicates()
        )
        if wordings.empty:
            # Nothing to verify; this also avoids .str on a non-string empty column.
            continue
        answers_matches = wordings.str.startswith(expected_beginnings)
        # `not` rather than `~`: bitwise NOT of a plain Python bool is always truthy.
        if not answers_matches.all():
            print("One of the answers that occur in the data should not be reversed.")
            print("This was the answer found in the data: ")
            raise KeyError(wordings[~answers_matches])

    if "esm_user_answer_numeric" not in df_esm_jcq_demand_control.columns:
        # Best-effort behaviour kept from the original: hint and return the input as is.
        print("Please, clean the dataframe first using features.esm.clean_up_esm.")
        print("Missing column: esm_user_answer_numeric")
        return df_esm_jcq_demand_control

    # Increment the original answer by 1
    # to keep in line with traditional scoring (JCQ_ORIGINAL_MIN - JCQ_ORIGINAL_MAX).
    df_scored = df_esm_jcq_demand_control.assign(
        esm_user_score=df_esm_jcq_demand_control["esm_user_answer_numeric"] + 1
    )

    # Reverse the items that require it.
    needs_reversal = df_scored["question_id"].isin(
        list(dict_JCQ_demand_control_reverse)
    )
    df_scored.loc[needs_reversal, "esm_user_score"] = (
        JCQ_ORIGINAL_MAX
        + JCQ_ORIGINAL_MIN
        - df_scored.loc[needs_reversal, "esm_user_score"]
    )
    return df_scored
|
|
@ -1,135 +0,0 @@
|
|||
import json
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
# Numeric codes of the ESM item types, keyed by a readable name.
# clean_up_esm compares these codes with the esm_type column.
ESM_TYPE = {
    "text": 1,
    "radio": 2,
    "checkbox": 3,
    "likert": 4,
    "quick_answers": 5,
    "scale": 6,
    "datetime": 7,
    "pam": 8,
    "number": 9,
    "web": 10,
    "date": 11,
}

# Questionnaire (scale) names mapped to the numeric IDs used in the questionnaire_id column.
QUESTIONNAIRE_IDS = {
    "sleep_quality": 1,
    "PANAS_positive_affect": 8,
    "PANAS_negative_affect": 9,
    "JCQ_job_demand": 10,
    "JCQ_job_control": 11,
    "JCQ_supervisor_support": 12,
    "JCQ_coworker_support": 13,
    "PFITS_supervisor": 14,
    "PFITS_coworkers": 15,
    "UWES_vigor": 16,
    "UWES_dedication": 17,
    "UWES_absorption": 18,
    "COPE_active": 19,
    "COPE_support": 20,
    "COPE_emotions": 21,
    "balance_life_work": 22,
    "balance_work_life": 23,
    "recovery_experience_detachment": 24,
    "recovery_experience_relaxation": 25,
    "symptoms": 26,
    "appraisal_stressfulness_event": 87,
    "appraisal_threat": 88,
    "appraisal_challenge": 89,
    "appraisal_event_time": 90,
    "appraisal_event_duration": 91,
    "appraisal_event_work_related": 92,
    "appraisal_stressfulness_period": 93,
    "late_work": 94,
    "work_hours": 95,
    "left_work": 96,
    "activities": 97,
    "coffee_breaks": 98,
    "at_work_yet": 99,
}

# Value of esm_status that clean_up_esm keeps; all other statuses are dropped.
ESM_STATUS_ANSWERED = 2

# NOTE(review): presumably the columns that identify one ESM session; not used in the visible code.
GROUP_SESSIONS_BY = ["participant_id", "device_id", "esm_session"]

# NOTE(review): session status labels; not used in the visible code.
SESSION_STATUS_UNANSWERED = "ema_unanswered"
SESSION_STATUS_DAY_FINISHED = "day_finished"
SESSION_STATUS_COMPLETE = "ema_completed"

# Values of esm_user_answer that are not real answers; clean_up_esm removes these rows.
ANSWER_DAY_FINISHED = "DayFinished3421"
ANSWER_DAY_OFF = "DayOff3421"
ANSWER_SET_EVENING = "DayFinishedSetEvening"

MAX_MORNING_LENGTH = 3
# When the participant was not yet at work at the time of the first (morning) EMA,
# only three items were answered:
# two sleep-related items and one indicating NOT starting work yet.
# Daytime EMAs are all longer; in fact, they always consist of at least 6 items.
|
||||
|
||||
|
||||
def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
    """
    Expand the JSON column of ESM data into several Pandas DF columns.

    Parameters
    ----------
    df_esm: pd.DataFrame
        A dataframe of esm data with a column esm_json holding one JSON object per row.

    Returns
    -------
    df_esm_preprocessed: pd.DataFrame
        The input dataframe with one added column per field of the JSON objects,
        except esm_trigger, which is already present in the main df.

    Raises
    ------
    ValueError
        Raised by DataFrame.join if a JSON field other than esm_trigger has the
        same name as an existing column.
    """
    parsed_records = df_esm["esm_json"].apply(json.loads).tolist()
    # errors="ignore": empty input or records without esm_trigger have nothing to drop.
    df_esm_json = pd.json_normalize(parsed_records).drop(
        columns=["esm_trigger"], errors="ignore"
    )  # The esm_trigger column is already present in the main df.
    # json_normalize on a list numbers rows from 0; reuse the caller's index so that
    # the join stays row-for-row even when df_esm was filtered or re-indexed.
    df_esm_json.index = df_esm.index
    return df_esm.join(df_esm_json)
|
||||
|
||||
|
||||
def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
    Eliminate invalid ESM responses and extract numeric answers.

    Rows whose esm_status is not ESM_STATUS_ANSWERED are removed, as are rows
    whose esm_user_answer is one of the control answers (ANSWER_DAY_FINISHED,
    ANSWER_DAY_OFF, ANSWER_SET_EVENING). For radio, scale and number items,
    the leading integer of the answer is extracted from strings such as
    "4 - I strongly agree" or "12".

    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
        A preprocessed dataframe of esm data with the columns esm_status,
        esm_user_answer and esm_type.

    Returns
    -------
    df_esm_clean: pd.DataFrame
        A copy of the retained rows with an added column esm_user_answer_numeric.
        It is NaN for non-numeric item types and for answers that do not start
        with an integer.
    """
    is_answered = df_esm_preprocessed["esm_status"] == ESM_STATUS_ANSWERED
    is_control_answer = df_esm_preprocessed["esm_user_answer"].isin(
        [ANSWER_DAY_FINISHED, ANSWER_DAY_OFF, ANSWER_SET_EVENING]
    )
    # Copy so that the new column is written to an independent frame, not a slice.
    df_esm_clean = df_esm_preprocessed.loc[is_answered & ~is_control_answer].copy()

    df_esm_clean["esm_user_answer_numeric"] = np.nan
    esm_type_numeric = [
        ESM_TYPE.get("radio"),
        ESM_TYPE.get("scale"),
        ESM_TYPE.get("number"),
    ]
    is_numeric_type = df_esm_clean["esm_type"].isin(esm_type_numeric)
    # Take the whole leading integer rather than only the first character,
    # so that multi-digit answers (e.g. "10 - ..." or "12") are not truncated.
    leading_integer = (
        df_esm_clean.loc[is_numeric_type, "esm_user_answer"]
        .astype(str)
        .str.extract(r"^\s*(-?\d+)", expand=False)
    )
    df_esm_clean.loc[is_numeric_type, "esm_user_answer_numeric"] = pd.to_numeric(
        leading_integer, errors="coerce"
    ).astype(float)
    return df_esm_clean
|
|
@ -1,63 +0,0 @@
|
|||
import pandas as pd
|
||||
|
||||
# Questionnaire (scale) names mapped to the numeric IDs used in the questionnaire_id column.
QUESTIONNAIRE_IDS = {
    "sleep_quality": 1,
    "PANAS_positive_affect": 8,
    "PANAS_negative_affect": 9,
    "JCQ_job_demand": 10,
    "JCQ_job_control": 11,
    "JCQ_supervisor_support": 12,
    "JCQ_coworker_support": 13,
    "PFITS_supervisor": 14,
    "PFITS_coworkers": 15,
    "UWES_vigor": 16,
    "UWES_dedication": 17,
    "UWES_absorption": 18,
    "COPE_active": 19,
    "COPE_support": 20,
    "COPE_emotions": 21,
    "balance_life_work": 22,
    "balance_work_life": 23,
    "recovery_experience_detachment": 24,
    "recovery_experience_relaxation": 25,
    "symptoms": 26,
    "appraisal_stressfulness_event": 87,
    "appraisal_threat": 88,
    "appraisal_challenge": 89,
    "appraisal_event_time": 90,
    "appraisal_event_duration": 91,
    "appraisal_event_work_related": 92,
    "appraisal_stressfulness_period": 93,
    "late_work": 94,
    "work_hours": 95,
    "left_work": 96,
    "activities": 97,
    "coffee_breaks": 98,
    "at_work_yet": 99,
}


def straw_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    """
    Compute the mean esm_user_score per local segment for every requested scale.

    Parameters
    ----------
    sensor_data_files: mapping
        Must contain the key "sensor_data" with the path of a CSV file that has
        the columns questionnaire_id, local_segment and esm_user_score.
    time_segment:
        Passed through unchanged to filter_data_by_segment.
    provider: mapping
        provider["SCALES"] lists the scale names (keys of QUESTIONNAIRE_IDS)
        to summarise. provider["FEATURES"] is not consulted: the output columns
        are determined by SCALES alone.
    filter_data_by_segment: callable
        Called as filter_data_by_segment(esm_data, time_segment); expected to
        return the rows that belong to the segment.

    Returns
    -------
    esm_features: pd.DataFrame
        One row per local_segment and one "<scale>_mean" column per requested
        scale. The same columns are returned (with no rows) when there is no data.

    Raises
    ------
    KeyError
        If a requested scale is not a key of QUESTIONNAIRE_IDS.
    """
    esm_data = pd.read_csv(sensor_data_files["sensor_data"])
    requested_scales = provider["SCALES"]
    # TODO Check valid questionnaire and feature names.
    feature_names = [scale + "_mean" for scale in requested_scales]
    # Same columns as the populated result, in the deterministic order of SCALES.
    empty_features = pd.DataFrame(columns=["local_segment"] + feature_names)

    if esm_data.empty:
        return empty_features
    esm_data = filter_data_by_segment(esm_data, time_segment)
    if esm_data.empty or not feature_names:
        return empty_features

    means_per_scale = {}
    for scale in requested_scales:
        questionnaire_id = QUESTIONNAIRE_IDS[scale]
        mask = esm_data["questionnaire_id"] == questionnaire_id
        means_per_scale[scale + "_mean"] = (
            esm_data.loc[mask].groupby(["local_segment"])["esm_user_score"].mean()
        )
        # TODO Create the column esm_user_score in esm_clean. Currently, this is only done when reversing.

    # Building the frame from all Series at once keeps the union of their segments;
    # assigning them one by one would restrict every scale to the first scale's segments.
    esm_features = pd.DataFrame(means_per_scale).rename_axis("local_segment")
    return esm_features.reset_index()
|
|
@ -1,25 +0,0 @@
|
|||
from esm_preprocess import *
|
||||
from esm_JCQ import reverse_jcq_demand_control_scoring
|
||||
|
||||
# Clean one participant's raw ESM data and score the requested questionnaires.
# Reads snakemake.input[0], writes snakemake.output[0].
# NOTE(review): `pd` and QUESTIONNAIRE_IDS reach this script only through
# `from esm_preprocess import *`; an explicit import would be more robust.
requested_scales = snakemake.params["scales"]

# Validate the configuration before doing any work on the data.
unknown_scales = set(requested_scales) - set(QUESTIONNAIRE_IDS.keys())
if unknown_scales:
    print("The requested questionnaire name should be one of the following:")
    print(QUESTIONNAIRE_IDS.keys())
    raise ValueError(f"You requested scales not collected: {sorted(unknown_scales)}")

df_esm = pd.read_csv(snakemake.input[0])
df_esm_preprocessed = preprocess_esm(df_esm)

df_esm_clean = clean_up_esm(df_esm_preprocessed)
# Default score is the raw numeric answer; questionnaires handled below overwrite it.
df_esm_clean["esm_user_score"] = df_esm_clean["esm_user_answer_numeric"]

for scale in requested_scales:
    questionnaire_id = QUESTIONNAIRE_IDS[scale]
    mask = df_esm_clean["questionnaire_id"] == questionnaire_id
    # Only JCQ scales are rescored, and only when this participant answered them.
    if scale.startswith("JCQ") and mask.any():
        df_esm_clean.loc[mask] = reverse_jcq_demand_control_scoring(df_esm_clean.loc[mask])
    # TODO Reverse other questionnaires if needed and/or adapt esm_user_score to original scoring.

df_esm_clean.to_csv(snakemake.output[0], index=False)
|
|
@ -26,10 +26,8 @@ barnett_daily_features <- function(snakemake){
|
|||
location <- location %>%
|
||||
mutate(is_daily = str_detect(assigned_segments, paste0(".*#", datetime_start_regex, ",", datetime_end_regex, ".*")))
|
||||
|
||||
does_not_span = nrow(segment_labels) == 0 || nrow(location) == 0 || all(location$is_daily == FALSE) || (max(location$timestamp) - min(location$timestamp) < 86400000)
|
||||
|
||||
if(is.na(does_not_span) || does_not_span){
|
||||
warning("Barnett's location features cannot be computed for data or time segments that do not span one or more entire days (00:00:00 to 23:59:59). Values below point to the problem:",
|
||||
if(nrow(segment_labels) == 0 || nrow(location) == 0 || all(location$is_daily == FALSE) || (max(location$timestamp) - min(location$timestamp) < 86400000)){
|
||||
warning("Barnett's location features cannot be computed for data or time segments that do not span one or more entire days (00:00:00 to 23:59:59). Values below point to the problem:",
|
||||
"\nLocation data rows within a daily time segment: ", nrow(filter(location, is_daily)),
|
||||
"\nLocation data time span in days: ", round((max(location$timestamp) - min(location$timestamp)) / 86400000, 2)
|
||||
)
|
||||
|
|
|
@ -1,18 +0,0 @@
|
|||
import pandas as pd
|
||||
|
||||
|
||||
def retain_target_column(df_input: pd.DataFrame, target_variable_name: str) -> pd.DataFrame:
    """
    Drop all phone_esm_straw columns except the target, which is renamed to "target".

    Parameters
    ----------
    df_input: pd.DataFrame
        A dataframe of features whose ESM-derived columns start with "phone_esm_straw".
    target_variable_name: str
        A pattern identifying the target among the phone_esm_straw columns.
        It is matched with Index.str.contains, i.e. interpreted as a regular expression.

    Returns
    -------
    sensor_features_plus_target: pd.DataFrame
        A new dataframe without the phone_esm_straw columns and with the
        selected one appended as the last column, named "target".

    Raises
    ------
    ValueError
        If no phone_esm_straw column, or more than one, matches target_variable_name.
    """
    column_names = df_input.columns
    # Find all columns coming from phone_esm, since these are not features for our purposes and we will drop them.
    esm_names = column_names[column_names.str.startswith("phone_esm_straw")]
    target_candidates = esm_names[esm_names.str.contains(target_variable_name)]

    if len(target_candidates) == 0:
        raise ValueError(
            f"The requested target ({target_variable_name}) cannot be found in the dataset. "
            "Please check the names of phone_esm_ columns in all_sensor_features_cleaned_rapids.csv"
        )
    if len(target_candidates) > 1:
        # Several matches cannot be stored in the single "target" column.
        raise ValueError(
            f"The requested target ({target_variable_name}) is ambiguous; "
            f"it matches several columns: {list(target_candidates)}"
        )

    sensor_features_plus_target = df_input.drop(columns=esm_names)
    # We will only keep one column related to phone_esm and that will be our target variable.
    # Add it back to the very end of the data frame and rename it to target.
    sensor_features_plus_target["target"] = df_input[target_candidates[0]]
    return sensor_features_plus_target
|
|
@ -1,20 +0,0 @@
|
|||
import pandas as pd
|
||||
|
||||
from helper import retain_target_column
|
||||
|
||||
# Merge the cleaned sensor features with every participant's baseline features,
# keep a single ESM column as the target and write the model input to snakemake.output[0].
sensor_features = pd.read_csv(snakemake.input["cleaned_sensor_features"])

# Collect the per-participant frames and concatenate once instead of growing a frame in the loop.
baseline_frames = []
for baseline_features_path in snakemake.input["demographic_features"]:
    # NOTE(review): presumably the fourth path component is the participant id - confirm against the rule's inputs.
    pid = baseline_features_path.split("/")[3]
    baseline_frames.append(pd.read_csv(baseline_features_path).assign(pid=pid))

if baseline_frames:
    all_baseline_features = pd.concat(baseline_frames, axis=0)
else:
    # No baseline files: keep a mergeable frame so the sensor features pass through unchanged.
    all_baseline_features = pd.DataFrame(
        {"pid": pd.Series(dtype=sensor_features["pid"].dtype)}
    )

# merge sensor features and baseline features
features = sensor_features.merge(all_baseline_features, on="pid", how="left")

target_variable_name = snakemake.params["target_variable"]
model_input = retain_target_column(features, target_variable_name)

model_input.to_csv(snakemake.output[0], index=False)
|
|
@ -1,11 +0,0 @@
|
|||
import pandas as pd
|
||||
|
||||
from helper import retain_target_column
|
||||
|
||||
# Build the model input from the cleaned sensor features:
# keep one ESM column as "target" and discard rows where it is missing.
target_variable_name = snakemake.params["target_variable"]
cleaned_sensor_features = pd.read_csv(snakemake.input["cleaned_sensor_features"])

model_input = retain_target_column(cleaned_sensor_features, target_variable_name)
# Rows without a target value cannot be used, so they are removed.
model_input = model_input.dropna(axis="index", how="any", subset=["target"])

model_input.to_csv(snakemake.output[0], index=False)
|
|
@ -24,12 +24,12 @@ def colors2colorscale(colors):
|
|||
def getDataForPlot(phone_data_yield_per_segment):
|
||||
# calculate the length (in minute) of per segment instance
|
||||
phone_data_yield_per_segment["length"] = phone_data_yield_per_segment["timestamps_segment"].str.split(",").apply(lambda x: int((int(x[1])-int(x[0])) / (1000 * 60)))
|
||||
# calculate the number of sensors logged at least one row of data per minute.
|
||||
phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(["local_segment", "length", "local_date", "local_hour", "local_minute"])[["sensor", "local_date_time"]].max().reset_index()
|
||||
# extract local start datetime of the segment from "local_segment" column
|
||||
phone_data_yield_per_segment["local_segment_start_datetimes"] = pd.to_datetime(phone_data_yield_per_segment["local_segment"].apply(lambda x: x.split("#")[1].split(",")[0]))
|
||||
# calculate the number of minutes after local start datetime of the segment
|
||||
phone_data_yield_per_segment["minutes_after_segment_start"] = ((phone_data_yield_per_segment["local_date_time"] - phone_data_yield_per_segment["local_segment_start_datetimes"]) / pd.Timedelta(minutes=1)).astype("int")
|
||||
# calculate the number of sensors logged at least one row of data per minute.
|
||||
phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(["local_segment", "length", "local_segment_start_datetimes", "minutes_after_segment_start"])[["sensor"]].max().reset_index()
|
||||
|
||||
# impute missing rows with 0
|
||||
columns_for_full_index = phone_data_yield_per_segment[["local_segment_start_datetimes", "length"]].drop_duplicates(keep="first")
|
||||
|
|
Loading…
Reference in New Issue