Add a rule to preprocess and clean ESM.

2022-03-09 18:38:46 +01:00 · 2022-03-09 18:38:46 +01:00 · d470eef27e
parent d4a4bbbff0
commit d470eef27e
3 changed files with 125 additions and 1 deletions
--- a/4
+++ b/4
@ -167,6 +167,10 @@ for provider in config["PHONE_CONVERSATION"]["PROVIDERS"].keys():
 for provider in config["PHONE_ESM"]["PROVIDERS"].keys():
     if config["PHONE_ESM"]["PROVIDERS"][provider]["COMPUTE"]:
         files_to_compute.extend(expand("data/raw/{pid}/phone_esm_raw.csv",pid=config["PIDS"]))
+        # Also request the datetime-annotated ESM file and the cleaned ESM file for every participant.
+        files_to_compute.extend(expand("data/raw/{pid}/phone_esm_with_datetime.csv",pid=config["PIDS"]))
+        files_to_compute.extend(expand("data/interim/{pid}/phone_esm_clean.csv",pid=config["PIDS"]))
+        # The feature-level outputs below are disabled, so they are not requested as targets.
+        #files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv",pid=config["PIDS"]))
+        #files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")

 # We can delete these if's as soon as we add feature PROVIDERS to any of these sensors
 if isinstance(config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"], dict):
--- a/rules/preprocessing.smk
+++ b/rules/preprocessing.smk
@ -177,7 +177,6 @@ rule resample_episodes_with_datetime:
    script:
        "../src/data/datetime/readable_datetime.R"

-
 rule phone_application_categories:
    input:
        "data/raw/{pid}/phone_applications_{type}_with_datetime.csv"
@ -191,6 +190,14 @@ rule phone_application_categories:
    script:
        "../src/data/application_categories.R"

+# Turn one participant's datetime-annotated ESM file into a cleaned ESM file
+# by running the STRAW ESM preprocessing script.
+rule preprocess_esm:
+    input: "data/raw/{pid}/phone_esm_with_datetime.csv"
+    params:
+        # Questionnaire IDs of interest; available to the script as snakemake.params.
+        questionnaire_ids = [8,9]
+    output: "data/interim/{pid}/phone_esm_clean.csv"
+    script:
+        "../src/features/phone_esm/straw/preprocess.py"
+
 rule pull_wearable_data:
    input: unpack(pull_wearable_data_input_with_mutation_scripts)
    params:
--- a/src/features/phone_esm/straw/preprocess.py
+++ b/src/features/phone_esm/straw/preprocess.py
@ -0,0 +1,113 @@
+import json
+import numpy as np
+import pandas as pd
+
+
+# Integer codes stored in the "esm_type" column, keyed by a readable type name.
+ESM_TYPE = {
+        "text": 1,
+        "radio": 2,
+        "checkbox": 3,
+        "likert": 4,
+        "quick_answers": 5,
+        "scale": 6,
+        "datetime": 7,
+        "pam": 8,
+        "number": 9,
+        "web": 10,
+        "date": 11,
+    }
+
+# Value of the "esm_status" column that marks an ESM as answered.
+ESM_STATUS_ANSWERED = 2
+
+# Columns that identify one ESM session (not used by the code in this script yet).
+GROUP_SESSIONS_BY = ["participant_id", "device_id", "esm_session"]
+
+# Labels for session-level status (not used by the code in this script yet).
+SESSION_STATUS_UNANSWERED = "ema_unanswered"
+SESSION_STATUS_DAY_FINISHED = "day_finished"
+SESSION_STATUS_COMPLETE = "ema_completed"
+
+# Special "esm_user_answer" values that are control responses rather than
+# questionnaire answers; rows carrying them are removed during clean-up.
+ANSWER_DAY_FINISHED = "DayFinished3421"
+ANSWER_DAY_OFF = "DayOff3421"
+ANSWER_SET_EVENING = "DayFinishedSetEvening"
+
+MAX_MORNING_LENGTH = 3
+# When a participant was not yet at work at the time of the first (morning) EMA,
+# only three items were answered:
+# two sleep-related items and one indicating that work had NOT started yet.
+# Daytime EMAs are all longer; in fact, they always consist of at least 6 items.
+
+
+def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
+    """
+    Expand the JSON column of an ESM dataframe into separate columns.
+
+    Every value of the ``esm_json`` column is parsed and flattened with
+    ``pd.json_normalize``; the resulting fields are appended to the input
+    dataframe as new columns. No timestamp conversion is done here.
+
+    Parameters
+    ----------
+    df_esm: pd.DataFrame
+        A dataframe of esm data with an ``esm_json`` column of JSON strings.
+        Any index is accepted; rows are matched by position, not by label.
+
+    Returns
+    -------
+    df_esm_preprocessed: pd.DataFrame
+        The input dataframe with one added column per field found in esm_json
+        (except esm_trigger, which the main dataframe already carries).
+
+    Raises
+    ------
+    ValueError
+        If esm_json contains another field whose name collides with an
+        existing column of df_esm (raised by ``DataFrame.join``).
+    """
+    parsed_json = df_esm["esm_json"].apply(json.loads)
+    df_esm_json = pd.json_normalize(parsed_json.tolist())
+    # json_normalize numbers its rows 0..n-1. Re-use the caller's index so the
+    # join below pairs each JSON record with the row it came from, even when
+    # df_esm was filtered or re-indexed beforehand.
+    df_esm_json.index = df_esm.index
+    # The esm_trigger column is already present in the main df.
+    # errors="ignore" keeps this working when the JSON lacks that field.
+    df_esm_json = df_esm_json.drop(columns=["esm_trigger"], errors="ignore")
+    return df_esm.join(df_esm_json)
+
+
+def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
+    """
+    Eliminate invalid ESM responses and extract numeric answers.
+
+    Rows are kept only if their esm_status equals ESM_STATUS_ANSWERED and
+    their esm_user_answer is not one of the control answers (day finished,
+    day off, set evening). For radio, scale and number ESMs the leading
+    integer of the answer is extracted, e.g. "4 - I strongly agree" -> 4.
+
+    Parameters
+    ----------
+    df_esm_preprocessed: pd.DataFrame
+        A preprocessed dataframe of esm data. It must contain the columns
+        esm_status, esm_user_answer and esm_type.
+
+    Returns
+    -------
+    df_esm_clean: pd.DataFrame
+        An independent copy of the retained rows with an added float column
+        esm_user_answer_numeric. The value is NaN for non-numeric ESM types
+        and for answers that do not start with an integer.
+
+    """
+    is_answered = df_esm_preprocessed["esm_status"] == ESM_STATUS_ANSWERED
+    is_control_answer = df_esm_preprocessed["esm_user_answer"].isin(
+        [ANSWER_DAY_FINISHED, ANSWER_DAY_OFF, ANSWER_SET_EVENING]
+    )
+    # Copy so that adding a column below never writes into (or warns about)
+    # a view of the caller's dataframe.
+    df_esm_clean = df_esm_preprocessed[is_answered & ~is_control_answer].copy()
+
+    esm_type_numeric = [
+        ESM_TYPE["radio"],
+        ESM_TYPE["scale"],
+        ESM_TYPE["number"],
+    ]
+    is_numeric_type = df_esm_clean["esm_type"].isin(esm_type_numeric)
+    # Take the whole leading integer rather than only the first character,
+    # so multi-digit answers such as "10" are not truncated to 1.
+    # astype(str) also covers answers that pandas parsed as numbers.
+    leading_integer = (
+        df_esm_clean.loc[is_numeric_type, "esm_user_answer"]
+        .astype(str)
+        .str.extract(r"^\s*(-?\d+)", expand=False)
+    )
+    df_esm_clean["esm_user_answer_numeric"] = np.nan
+    # Answers without a leading integer become NaN instead of raising.
+    df_esm_clean.loc[is_numeric_type, "esm_user_answer_numeric"] = pd.to_numeric(
+        leading_integer, errors="coerce"
+    )
+    return df_esm_clean
+
+
+# Script body executed by Snakemake: read the datetime-annotated ESM file,
+# expand its JSON column, keep the requested questionnaires, clean, and save.
+df_esm = pd.read_csv(snakemake.input[0])
+df_esm_preprocessed = preprocess_esm(df_esm)
+# Select the questionnaires listed in the rule's `questionnaire_ids` param
+# instead of hard-coding their IDs here.
+df_esm_selected = df_esm_preprocessed[
+    df_esm_preprocessed["questionnaire_id"].isin(
+        snakemake.params["questionnaire_ids"]
+    )
+]
+df_esm_clean = clean_up_esm(df_esm_selected)
+
+df_esm_clean.to_csv(snakemake.output[0])
+
+