Add a rule to preprocess and clean ESM.
parent
d4a4bbbff0
commit
d470eef27e
|
@ -167,6 +167,10 @@ for provider in config["PHONE_CONVERSATION"]["PROVIDERS"].keys():
|
|||
for provider in config["PHONE_ESM"]["PROVIDERS"].keys():
    # Providers that are switched off contribute no targets.
    if not config["PHONE_ESM"]["PROVIDERS"][provider]["COMPUTE"]:
        continue
    # Request the raw, datetime-annotated and cleaned ESM file of every participant.
    for esm_target in (
        "data/raw/{pid}/phone_esm_raw.csv",
        "data/raw/{pid}/phone_esm_with_datetime.csv",
        "data/interim/{pid}/phone_esm_clean.csv",
    ):
        files_to_compute.extend(expand(esm_target, pid=config["PIDS"]))
    # Feature targets are not requested yet; kept here as placeholders.
    #files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv",pid=config["PIDS"]))
    #files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
||||
|
||||
# We can delete these if's as soon as we add feature PROVIDERS to any of these sensors
|
||||
if isinstance(config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"], dict):
|
||||
|
|
|
@ -177,7 +177,6 @@ rule resample_episodes_with_datetime:
|
|||
script:
|
||||
"../src/data/datetime/readable_datetime.R"
|
||||
|
||||
|
||||
rule phone_application_categories:
|
||||
input:
|
||||
"data/raw/{pid}/phone_applications_{type}_with_datetime.csv"
|
||||
|
@ -191,6 +190,14 @@ rule phone_application_categories:
|
|||
script:
|
||||
"../src/data/application_categories.R"
|
||||
|
||||
# Expands the esm_json column and keeps only answered ESM items
# of the questionnaires listed below.
rule preprocess_esm:
    input:
        "data/raw/{pid}/phone_esm_with_datetime.csv"
    params:
        questionnaire_ids = [8,9]
    output:
        "data/interim/{pid}/phone_esm_clean.csv"
    script:
        "../src/features/phone_esm/straw/preprocess.py"
|
||||
|
||||
rule pull_wearable_data:
|
||||
input: unpack(pull_wearable_data_input_with_mutation_scripts)
|
||||
params:
|
||||
|
|
|
@ -0,0 +1,113 @@
|
|||
import json
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
# Numeric code of each ESM item type, as found in the esm_type column.
ESM_TYPE = dict(
    text=1,
    radio=2,
    checkbox=3,
    likert=4,
    quick_answers=5,
    scale=6,
    datetime=7,
    pam=8,
    number=9,
    web=10,
    date=11,
)

# Value of esm_status that marks an item as answered.
ESM_STATUS_ANSWERED = 2

# Presumably the columns that together identify one ESM session
# (not used in this part of the file -- verify against callers).
GROUP_SESSIONS_BY = ["participant_id", "device_id", "esm_session"]

# Session status labels (not used in this part of the file).
SESSION_STATUS_UNANSWERED = "ema_unanswered"
SESSION_STATUS_DAY_FINISHED = "day_finished"
SESSION_STATUS_COMPLETE = "ema_completed"

# Special values of esm_user_answer that are not real questionnaire answers;
# rows carrying them are removed during clean-up.
ANSWER_DAY_FINISHED = "DayFinished3421"
ANSWER_DAY_OFF = "DayOff3421"
ANSWER_SET_EVENING = "DayFinishedSetEvening"

# Maximum number of items in a shortened morning EMA.
# When the participant was not yet at work at the time of the first (morning) EMA,
# only three items were answered: two sleep-related items and one indicating
# that work had NOT started yet.
# Daytime EMAs are all longer; in fact, they always consist of at least 6 items.
MAX_MORNING_LENGTH = 3
|
||||
|
||||
|
||||
def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
    """
    Expand the JSON column into several Pandas DF columns.

    Parameters
    ----------
    df_esm: pd.DataFrame
        A dataframe of esm data. It must contain an "esm_json" column
        holding one JSON object (as a string) per row.

    Returns
    -------
    df_esm_preprocessed: pd.DataFrame
        The input dataframe with one added column per field of the esm_json
        column (nested fields are flattened by pd.json_normalize).
        The index of the input is preserved.
    """
    parsed_json = df_esm["esm_json"].apply(json.loads)
    df_esm_json = pd.json_normalize(parsed_json.tolist())
    # json_normalize numbers its rows 0..n-1; reuse the index of df_esm so that
    # the join below pairs every row with its own JSON fields, even when df_esm
    # has been filtered or re-indexed beforehand.
    df_esm_json.index = df_esm.index
    # The esm_trigger column is already present in the main df.
    # It is not an error if the JSON does not carry it.
    df_esm_json = df_esm_json.drop(columns=["esm_trigger"], errors="ignore")
    return df_esm.join(df_esm_json)
|
||||
|
||||
|
||||
def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
    This function eliminates invalid ESM responses.
    It removes unanswered ESMs and those that indicate end of work and similar.
    It also extracts a numeric answer from strings such as "4 - I strongly agree".

    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
        A preprocessed dataframe of esm data. The columns esm_status,
        esm_user_answer and esm_type are used.

    Returns
    -------
    df_esm_clean: pd.DataFrame
        A copy of the valid rows of the original dataframe with an added
        float column esm_user_answer_numeric. It holds the leading integer of
        the answer for radio, scale and number items and NaN otherwise
        (including answers that do not start with a number).
    """
    is_answered = df_esm_preprocessed["esm_status"] == ESM_STATUS_ANSWERED
    is_special_answer = df_esm_preprocessed["esm_user_answer"].isin(
        [ANSWER_DAY_FINISHED, ANSWER_DAY_OFF, ANSWER_SET_EVENING]
    )
    # Work on a copy: adding a column to a filtered slice would otherwise
    # trigger pandas' SettingWithCopy behaviour.
    df_esm_clean = df_esm_preprocessed[is_answered & ~is_special_answer].copy()

    esm_type_numeric = [
        ESM_TYPE.get("radio"),
        ESM_TYPE.get("scale"),
        ESM_TYPE.get("number"),
    ]
    is_numeric_type = df_esm_clean["esm_type"].isin(esm_type_numeric)

    # Take the whole leading integer, not just the first character, so that
    # multi-digit answers such as "10" are not truncated to 1.
    leading_number = (
        df_esm_clean["esm_user_answer"]
        .astype(str)
        .str.extract(r"^\s*(-?\d+)", expand=False)
    )
    df_esm_clean["esm_user_answer_numeric"] = (
        pd.to_numeric(leading_number, errors="coerce")
        .astype(float)
        .where(is_numeric_type)  # Other item types stay NaN.
    )
    return df_esm_clean
|
||||
|
||||
|
||||
# Script body, executed by Snakemake (which provides the `snakemake` object).
df_esm = pd.read_csv(snakemake.input[0])
df_esm_preprocessed = preprocess_esm(df_esm)

# Keep only the questionnaires requested by the rule
# through its `questionnaire_ids` parameter.
df_esm_selected = df_esm_preprocessed[
    df_esm_preprocessed["questionnaire_id"].isin(
        snakemake.params["questionnaire_ids"]
    )
]
df_esm_clean = clean_up_esm(df_esm_selected)

df_esm_clean.to_csv(snakemake.output[0])
|
||||
|
||||
|
Loading…
Reference in New Issue