Merge branch 'labels' into run_test_participant

labels
junos 2022-03-16 17:09:53 +01:00
commit 155395512c
6 changed files with 179 additions and 1 deletions

View File

@ -167,6 +167,10 @@ for provider in config["PHONE_CONVERSATION"]["PROVIDERS"].keys():
# Register the raw, datetime-annotated and cleaned ESM files of every
# participant once for each PHONE_ESM provider that has COMPUTE switched on.
for provider, provider_settings in config["PHONE_ESM"]["PROVIDERS"].items():
    if not provider_settings["COMPUTE"]:
        continue
    for esm_file_pattern in (
        "data/raw/{pid}/phone_esm_raw.csv",
        "data/raw/{pid}/phone_esm_with_datetime.csv",
        "data/interim/{pid}/phone_esm_clean.csv",
    ):
        files_to_compute.extend(expand(esm_file_pattern, pid=config["PIDS"]))
#files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv",pid=config["PIDS"]))
#files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
# We can delete these if's as soon as we add feature PROVIDERS to any of these sensors
if isinstance(config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"], dict):

View File

@ -645,3 +645,7 @@ PARAMS_FOR_ANALYSIS:
QUESTION_LIST: survey637813+question_text.csv
FEATURES: [age, gender, startlanguage, demand, control, demand_control_ratio]
CATEGORICAL_FEATURES: [gender]
TARGET:
SCALE: [positive_affect, negative_affect]

View File

@ -324,6 +324,14 @@ rule conversation_r_features:
script:
"../src/features/entry.R"
# Produce the cleaned ESM file of one participant from the ESM data that
# already carries readable datetimes.
# Questionnaire IDs 8 and 9 are the PANAS positive_affect / negative_affect
# entries of QUESTIONNAIRE_IDS in the esm_preprocess module.
rule preprocess_esm:
    input:
        "data/raw/{pid}/phone_esm_with_datetime.csv"
    params:
        questionnaire_ids = [8,9]
    output:
        "data/interim/{pid}/phone_esm_clean.csv"
    script:
        "../src/features/phone_esm/straw/preprocess.py"
rule phone_keyboard_python_features:
input:
sensor_data = "data/raw/{pid}/phone_keyboard_with_datetime.csv",

View File

@ -177,7 +177,6 @@ rule resample_episodes_with_datetime:
script:
"../src/data/datetime/readable_datetime.R"
rule phone_application_categories:
input:
"data/raw/{pid}/phone_applications_{type}_with_datetime.csv"

View File

@ -0,0 +1,151 @@
import json
import numpy as np
import pandas as pd
# Numeric codes of the ESM question types; clean_up_esm compares them with the
# "esm_type" column. The codes are consecutive and start at 1, so each code is
# derived from the position of its name in the tuple below. Append new types at
# the end: inserting one in the middle would renumber the types that follow.
ESM_TYPE = {
    type_name: type_code
    for type_code, type_name in enumerate(
        (
            "text",
            "radio",
            "checkbox",
            "likert",
            "quick_answers",
            "scale",
            "datetime",
            "pam",
            "number",
            "web",
            "date",
        ),
        start=1,
    )
}
# Questionnaire IDs, keyed by questionnaire name.
# A questionnaire with subscales maps to a nested dict {subscale: id};
# a questionnaire without subscales maps directly to its integer id.
QUESTIONNAIRE_IDS = {
    "sleep_quality": 1,
    # IDs 8-26: multi-subscale questionnaires
    "PANAS": {
        "positive_affect": 8,
        "negative_affect": 9,
    },
    "job_content_questionnaire": {
        "job_demand": 10,
        "job_control": 11,
        "supervisor_support": 12,
        "coworker_support": 13,
    },
    "PFITS": {
        "supervisor": 14,
        "coworkers": 15,
    },
    "UWES": {
        "vigor": 16,
        "dedication": 17,
        "absorption": 18,
    },
    "COPE": {
        "active": 19,
        "support": 20,
        "emotions": 21,
    },
    "work_life_balance": {
        "life_work": 22,
        "work_life": 23,
    },
    "recovery_experience": {
        "detachment": 24,
        "relaxation": 25,
    },
    "symptoms": 26,
    # IDs 87-99: stress appraisal and single work-related items
    "stress_appraisal": {
        "stressfulness_event": 87,
        "threat": 88,
        "challenge": 89,
        "event_time": 90,
        "event_duration": 91,
        "event_work_related": 92,
        "stressfulness_period": 93,
    },
    "late_work": 94,
    "work_hours": 95,
    "left_work": 96,
    "activities": 97,
    "coffee_breaks": 98,
    "at_work_yet": 99,
}
# Value of the "esm_status" column that marks an answered ESM;
# clean_up_esm keeps only rows with this status.
ESM_STATUS_ANSWERED = 2

# Column names, presumably the key that identifies one ESM session when
# grouping rows. Not used in this part of the file - TODO confirm against callers.
GROUP_SESSIONS_BY = ["participant_id", "device_id", "esm_session"]

# Session status labels. NOTE(review): not referenced in this part of the
# file; presumably assigned when whole sessions are classified - confirm.
SESSION_STATUS_UNANSWERED = "ema_unanswered"
SESSION_STATUS_DAY_FINISHED = "day_finished"
SESSION_STATUS_COMPLETE = "ema_completed"

# Special values of "esm_user_answer"; clean_up_esm drops rows whose answer
# is one of these instead of treating them as questionnaire responses.
ANSWER_DAY_FINISHED = "DayFinished3421"
ANSWER_DAY_OFF = "DayOff3421"
ANSWER_SET_EVENING = "DayFinishedSetEvening"

MAX_MORNING_LENGTH = 3
# When the participant was not yet at work at the time of the first (morning) EMA,
# only three items were answered:
# two sleep-related items and one indicating NOT starting work yet.
# Daytime EMAs are all longer; in fact, they always consist of at least 6 items.
def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
    """
    Expand the JSON column into several Pandas DF columns.

    Every value of the "esm_json" column is parsed as a JSON object and its
    fields are appended to the dataframe as new columns. Nested objects are
    flattened by pandas.json_normalize (dot-separated column names).

    Parameters
    ----------
    df_esm: pd.DataFrame
        A dataframe of esm data with a column "esm_json" holding JSON strings.

    Returns
    -------
    df_esm_preprocessed: pd.DataFrame
        A new dataframe with the original columns, index and row order,
        plus one column per field found in the "esm_json" column.

    Raises
    ------
    ValueError
        Raised by DataFrame.join if a JSON field other than "esm_trigger"
        has the same name as an existing column.
    """
    parsed_json = df_esm["esm_json"].apply(json.loads).tolist()
    df_esm_json = pd.json_normalize(parsed_json)
    # json_normalize numbers its rows 0..n-1. Re-use the caller's index so that
    # the join below pairs each row with its own JSON fields even when df_esm
    # was filtered or re-indexed beforehand.
    df_esm_json.index = df_esm.index
    # The esm_trigger column is already present in the main df.
    # errors="ignore" keeps this working when the JSON carries no such field
    # (e.g. for an empty dataframe).
    df_esm_json = df_esm_json.drop(columns=["esm_trigger"], errors="ignore")
    return df_esm.join(df_esm_json)
def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
    This function eliminates invalid ESM responses.
    It removes unanswered ESMs and those that indicate end of work and similar.
    It also extracts a numeric answer from strings such as "4 - I strongly agree".

    The numeric answer is the number at the very start of "esm_user_answer"
    (all of its digits, so "10 - ..." yields 10) and is only extracted for
    the radio, scale and number ESM types. Rows of other types, and answers
    that do not start with a number, get NaN.

    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
        A preprocessed dataframe of esm data. It needs the columns
        "esm_status", "esm_user_answer" and "esm_type".

    Returns
    -------
    df_esm_clean: pd.DataFrame
        A subset of the original dataframe (as a copy, the input is not
        modified) with the added float column "esm_user_answer_numeric".
    """
    is_answered = df_esm_preprocessed["esm_status"] == ESM_STATUS_ANSWERED
    is_control_answer = df_esm_preprocessed["esm_user_answer"].isin(
        [ANSWER_DAY_FINISHED, ANSWER_DAY_OFF, ANSWER_SET_EVENING]
    )
    # Copy explicitly: a column is added below and must not be written into
    # a slice of the caller's dataframe.
    df_esm_clean = df_esm_preprocessed[is_answered & ~is_control_answer].copy()

    df_esm_clean["esm_user_answer_numeric"] = np.nan
    esm_type_numeric = [
        ESM_TYPE["radio"],
        ESM_TYPE["scale"],
        ESM_TYPE["number"],
    ]
    has_numeric_type = df_esm_clean["esm_type"].isin(esm_type_numeric)

    # Take the whole leading number rather than only the first character, so
    # that multi-digit answers are not truncated. astype(str) guards against
    # a column that pandas parsed as numeric; missing answers become "nan",
    # which does not match the pattern and therefore stays NaN.
    leading_number = (
        df_esm_clean.loc[has_numeric_type, "esm_user_answer"]
        .astype(str)
        .str.extract(r"^\s*(-?\d+(?:\.\d+)?)", expand=False)
    )
    df_esm_clean.loc[has_numeric_type, "esm_user_answer_numeric"] = pd.to_numeric(
        leading_number
    )
    return df_esm_clean

View File

@ -0,0 +1,12 @@
import pandas as pd

from esm_preprocess import clean_up_esm, preprocess_esm

# Snakemake script: the `snakemake` object is injected by the `script:`
# directive of the rule that runs this file.
df_esm = pd.read_csv(snakemake.input[0])
df_esm_preprocessed = preprocess_esm(df_esm)

# Keep only the questionnaires requested by the rule through its
# `questionnaire_ids` param instead of hard-coding the IDs here.
questionnaire_ids = list(snakemake.params.questionnaire_ids)
df_esm_selected = df_esm_preprocessed[
    df_esm_preprocessed["questionnaire_id"].isin(questionnaire_ids)
]

df_esm_clean = clean_up_esm(df_esm_selected)
# NOTE(review): the dataframe index is written as the first, unnamed column;
# kept as is because downstream readers of this file are not visible here.
df_esm_clean.to_csv(snakemake.output[0])