From d470eef27e09985d5a3dedc8369d772397fb0624 Mon Sep 17 00:00:00 2001 From: junos Date: Wed, 9 Mar 2022 18:38:46 +0100 Subject: [PATCH 1/5] Add a rule to preprocess and clean ESM. --- Snakefile | 4 + rules/preprocessing.smk | 9 +- src/features/phone_esm/straw/preprocess.py | 113 +++++++++++++++++++++ 3 files changed, 125 insertions(+), 1 deletion(-) create mode 100644 src/features/phone_esm/straw/preprocess.py diff --git a/Snakefile b/Snakefile index 95cfbc95..24cdb73e 100644 --- a/Snakefile +++ b/Snakefile @@ -167,6 +167,10 @@ for provider in config["PHONE_CONVERSATION"]["PROVIDERS"].keys(): for provider in config["PHONE_ESM"]["PROVIDERS"].keys(): if config["PHONE_ESM"]["PROVIDERS"][provider]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/phone_esm_raw.csv",pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_esm_with_datetime.csv",pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_esm_clean.csv",pid=config["PIDS"])) + #files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv",pid=config["PIDS"])) + #files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") # We can delete these if's as soon as we add feature PROVIDERS to any of these sensors if isinstance(config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"], dict): diff --git a/rules/preprocessing.smk b/rules/preprocessing.smk index cd83b95b..154556f3 100644 --- a/rules/preprocessing.smk +++ b/rules/preprocessing.smk @@ -177,7 +177,6 @@ rule resample_episodes_with_datetime: script: "../src/data/datetime/readable_datetime.R" - rule phone_application_categories: input: "data/raw/{pid}/phone_applications_{type}_with_datetime.csv" @@ -191,6 +190,14 @@ rule phone_application_categories: script: "../src/data/application_categories.R" +rule preprocess_esm: + input: "data/raw/{pid}/phone_esm_with_datetime.csv" + params: + questionnaire_ids = [8,9] + output: "data/interim/{pid}/phone_esm_clean.csv" + script: + "../src/features/phone_esm/straw/preprocess.py" + rule pull_wearable_data: input: unpack(pull_wearable_data_input_with_mutation_scripts) params: diff --git a/src/features/phone_esm/straw/preprocess.py b/src/features/phone_esm/straw/preprocess.py new file mode 100644 index 00000000..7a38ecfe --- /dev/null +++ b/src/features/phone_esm/straw/preprocess.py @@ -0,0 +1,113 @@ +import json +import numpy as np +import pandas as pd + + +ESM_TYPE = { + "text": 1, + "radio": 2, + "checkbox": 3, + "likert": 4, + "quick_answers": 5, + "scale": 6, + "datetime": 7, + "pam": 8, + "number": 9, + "web": 10, + "date": 11, + } + +ESM_STATUS_ANSWERED = 2 + +GROUP_SESSIONS_BY = ["participant_id", "device_id", "esm_session"] + +SESSION_STATUS_UNANSWERED = "ema_unanswered" +SESSION_STATUS_DAY_FINISHED = "day_finished" +SESSION_STATUS_COMPLETE = "ema_completed" + +ANSWER_DAY_FINISHED = "DayFinished3421" +ANSWER_DAY_OFF = "DayOff3421" +ANSWER_SET_EVENING = "DayFinishedSetEvening" + +MAX_MORNING_LENGTH = 3 +# When the participants was not yet at work at the time of the first (morning) EMA, +# only three items were answered. +# Two sleep related items and one indicating NOT starting work yet. +# Daytime EMAs are all longer, in fact they always consist of at least 6 items. + + +def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame: + """ + Convert timestamps into human-readable datetimes and dates + and expand the JSON column into several Pandas DF columns. + + Parameters + ---------- + df_esm: pd.DataFrame + A dataframe of esm data. + + Returns + ------- + df_esm_preprocessed: pd.DataFrame + A dataframe with added columns: datetime in Ljubljana timezone and all fields from ESM_JSON column. + """ + df_esm_json = df_esm["esm_json"].apply(json.loads) + df_esm_json = pd.json_normalize(df_esm_json).drop( + columns=["esm_trigger"] + ) # The esm_trigger column is already present in the main df. + return df_esm.join(df_esm_json) + + +def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame: + """ + This function eliminates invalid ESM responses. + It removes unanswered ESMs and those that indicate end of work and similar. + It also extracts a numeric answer from strings such as "4 - I strongly agree". + + Parameters + ---------- + df_esm_preprocessed: pd.DataFrame + A preprocessed dataframe of esm data. + + Returns + ------- + df_esm_clean: pd.DataFrame + A subset of the original dataframe. + + """ + df_esm_clean = df_esm_preprocessed[ + df_esm_preprocessed["esm_status"] == ESM_STATUS_ANSWERED + ] + df_esm_clean = df_esm_clean[ + ~df_esm_clean["esm_user_answer"].isin( + [ANSWER_DAY_FINISHED, ANSWER_DAY_OFF, ANSWER_SET_EVENING] + ) + ] + df_esm_clean["esm_user_answer_numeric"] = np.nan + esm_type_numeric = [ + ESM_TYPE.get("radio"), + ESM_TYPE.get("scale"), + ESM_TYPE.get("number"), + ] + df_esm_clean.loc[ + df_esm_clean["esm_type"].isin(esm_type_numeric) + ] = df_esm_clean.loc[df_esm_clean["esm_type"].isin(esm_type_numeric)].assign( + esm_user_answer_numeric=lambda x: x.esm_user_answer.str.slice(stop=1).astype( + int + ) + ) + return df_esm_clean + + +df_esm = pd.read_csv(snakemake.input[0]) +df_esm_preprocessed = preprocess_esm(df_esm) +#TODO Enable getting the right questionnaire here. +df_esm_PANAS = df_esm_preprocessed[ + (df_esm_preprocessed["questionnaire_id"] == 8) + | (df_esm_preprocessed["questionnaire_id"] == 9) +] +df_esm_clean = clean_up_esm(df_esm_PANAS) + +df_esm_clean.to_csv(snakemake.output[0]) + + From 5f293211a7fecea8b9943001bc8820a7a8b9f86a Mon Sep 17 00:00:00 2001 From: junos Date: Tue, 15 Mar 2022 13:28:51 +0100 Subject: [PATCH 2/5] Reformat. --- src/features/phone_esm/straw/preprocess.py | 30 ++++++++++------------ 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/src/features/phone_esm/straw/preprocess.py b/src/features/phone_esm/straw/preprocess.py index 7a38ecfe..d6279d9c 100644 --- a/src/features/phone_esm/straw/preprocess.py +++ b/src/features/phone_esm/straw/preprocess.py @@ -1,21 +1,21 @@ import json + import numpy as np import pandas as pd - ESM_TYPE = { - "text": 1, - "radio": 2, - "checkbox": 3, - "likert": 4, - "quick_answers": 5, - "scale": 6, - "datetime": 7, - "pam": 8, - "number": 9, - "web": 10, - "date": 11, - } + "text": 1, + "radio": 2, + "checkbox": 3, + "likert": 4, + "quick_answers": 5, + "scale": 6, + "datetime": 7, + "pam": 8, + "number": 9, + "web": 10, + "date": 11, +} ESM_STATUS_ANSWERED = 2 @@ -101,7 +101,7 @@ def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame: df_esm = pd.read_csv(snakemake.input[0]) df_esm_preprocessed = preprocess_esm(df_esm) -#TODO Enable getting the right questionnaire here. +# TODO Enable getting the right questionnaire here. df_esm_PANAS = df_esm_preprocessed[ (df_esm_preprocessed["questionnaire_id"] == 8) | (df_esm_preprocessed["questionnaire_id"] == 9) @@ -109,5 +109,3 @@ df_esm_PANAS = df_esm_preprocessed[ df_esm_clean = clean_up_esm(df_esm_PANAS) df_esm_clean.to_csv(snakemake.output[0]) - - From ef57103bac79faedfd498cfa56b5efb0dd608207 Mon Sep 17 00:00:00 2001 From: junos Date: Tue, 15 Mar 2022 13:41:33 +0100 Subject: [PATCH 3/5] Add questionnaire ID key. --- config.yaml | 4 ++ src/features/phone_esm/straw/preprocess.py | 52 ++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/config.yaml b/config.yaml index 92578437..5c774a6e 100644 --- a/config.yaml +++ b/config.yaml @@ -645,3 +645,7 @@ PARAMS_FOR_ANALYSIS: QUESTION_LIST: survey637813+question_text.csv FEATURES: [age, gender, startlanguage, demand, control, demand_control_ratio] CATEGORICAL_FEATURES: [gender] + + TARGET: + SCALE: [positive_affect, negative_affect] + diff --git a/src/features/phone_esm/straw/preprocess.py b/src/features/phone_esm/straw/preprocess.py index d6279d9c..cd99d906 100644 --- a/src/features/phone_esm/straw/preprocess.py +++ b/src/features/phone_esm/straw/preprocess.py @@ -17,6 +17,58 @@ ESM_TYPE = { "date": 11, } +QUESTIONNAIRE_IDS = { + "sleep_quality": 1, + "PANAS": { + "positive_affect": 8, + "negative_affect": 9 + }, + "job_content_questionnaire": { + "job_demand": 10, + "job_control": 11, + "supervisor_support": 12, + "coworker_support": 13, + }, + "PFITS": { + "supervisor": 14, + "coworkers": 15 + }, + "UWES": { + "vigor": 16, + "dedication": 17, + "absorption": 18 + }, + "COPE": { + "active": 19, + "support": 20, + "emotions": 21 + }, + "work_life_balance": { + "life_work": 22, + "work_life": 23 + }, + "recovery_experience": { + "detachment": 24, + "relaxation": 25 + }, + "symptoms": 26, + "stress_appraisal": { + "stressfulness_event": 87, + "threat": 88, + "challenge": 89, + "event_time": 90, + "event_duration": 91, + "event_work_related": 92, + "stressfulness_period": 93, + }, + "late_work": 94, + "work_hours": 95, + "left_work": 96, + "activities": 97, + "coffee_breaks": 98, + "at_work_yet": 99, +} + ESM_STATUS_ANSWERED = 2 GROUP_SESSIONS_BY = ["participant_id", "device_id", "esm_session"] From 19b9da0ba367b67e4359756370f0ed747cc36237 Mon Sep 17 00:00:00 2001 From: junos Date: Wed, 16 Mar 2022 16:49:28 +0100 Subject: [PATCH 4/5] Separate function definitions from main. --- .../phone_esm/straw/esm_preprocess.py | 151 +++++++++++++++++ src/features/phone_esm/straw/preprocess.py | 153 +----------------- 2 files changed, 152 insertions(+), 152 deletions(-) create mode 100644 src/features/phone_esm/straw/esm_preprocess.py diff --git a/src/features/phone_esm/straw/esm_preprocess.py b/src/features/phone_esm/straw/esm_preprocess.py new file mode 100644 index 00000000..876be61e --- /dev/null +++ b/src/features/phone_esm/straw/esm_preprocess.py @@ -0,0 +1,151 @@ +import json + +import numpy as np +import pandas as pd + +ESM_TYPE = { + "text": 1, + "radio": 2, + "checkbox": 3, + "likert": 4, + "quick_answers": 5, + "scale": 6, + "datetime": 7, + "pam": 8, + "number": 9, + "web": 10, + "date": 11, +} + +QUESTIONNAIRE_IDS = { + "sleep_quality": 1, + "PANAS": { + "positive_affect": 8, + "negative_affect": 9 + }, + "job_content_questionnaire": { + "job_demand": 10, + "job_control": 11, + "supervisor_support": 12, + "coworker_support": 13, + }, + "PFITS": { + "supervisor": 14, + "coworkers": 15 + }, + "UWES": { + "vigor": 16, + "dedication": 17, + "absorption": 18 + }, + "COPE": { + "active": 19, + "support": 20, + "emotions": 21 + }, + "work_life_balance": { + "life_work": 22, + "work_life": 23 + }, + "recovery_experience": { + "detachment": 24, + "relaxation": 25 + }, + "symptoms": 26, + "stress_appraisal": { + "stressfulness_event": 87, + "threat": 88, + "challenge": 89, + "event_time": 90, + "event_duration": 91, + "event_work_related": 92, + "stressfulness_period": 93, + }, + "late_work": 94, + "work_hours": 95, + "left_work": 96, + "activities": 97, + "coffee_breaks": 98, + "at_work_yet": 99, +} + +ESM_STATUS_ANSWERED = 2 + +GROUP_SESSIONS_BY = ["participant_id", "device_id", "esm_session"] + +SESSION_STATUS_UNANSWERED = "ema_unanswered" +SESSION_STATUS_DAY_FINISHED = "day_finished" +SESSION_STATUS_COMPLETE = "ema_completed" + +ANSWER_DAY_FINISHED = "DayFinished3421" +ANSWER_DAY_OFF = "DayOff3421" +ANSWER_SET_EVENING = "DayFinishedSetEvening" + +MAX_MORNING_LENGTH = 3 +# When the participants was not yet at work at the time of the first (morning) EMA, +# only three items were answered. +# Two sleep related items and one indicating NOT starting work yet. +# Daytime EMAs are all longer, in fact they always consist of at least 6 items. + + +def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame: + """ + Convert timestamps into human-readable datetimes and dates + and expand the JSON column into several Pandas DF columns. + + Parameters + ---------- + df_esm: pd.DataFrame + A dataframe of esm data. + + Returns + ------- + df_esm_preprocessed: pd.DataFrame + A dataframe with added columns: datetime in Ljubljana timezone and all fields from ESM_JSON column. + """ + df_esm_json = df_esm["esm_json"].apply(json.loads) + df_esm_json = pd.json_normalize(df_esm_json).drop( + columns=["esm_trigger"] + ) # The esm_trigger column is already present in the main df. + return df_esm.join(df_esm_json) + + +def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame: + """ + This function eliminates invalid ESM responses. + It removes unanswered ESMs and those that indicate end of work and similar. + It also extracts a numeric answer from strings such as "4 - I strongly agree". + + Parameters + ---------- + df_esm_preprocessed: pd.DataFrame + A preprocessed dataframe of esm data. + + Returns + ------- + df_esm_clean: pd.DataFrame + A subset of the original dataframe. + + """ + df_esm_clean = df_esm_preprocessed[ + df_esm_preprocessed["esm_status"] == ESM_STATUS_ANSWERED + ] + df_esm_clean = df_esm_clean[ + ~df_esm_clean["esm_user_answer"].isin( + [ANSWER_DAY_FINISHED, ANSWER_DAY_OFF, ANSWER_SET_EVENING] + ) + ] + df_esm_clean["esm_user_answer_numeric"] = np.nan + esm_type_numeric = [ + ESM_TYPE.get("radio"), + ESM_TYPE.get("scale"), + ESM_TYPE.get("number"), + ] + df_esm_clean.loc[ + df_esm_clean["esm_type"].isin(esm_type_numeric) + ] = df_esm_clean.loc[df_esm_clean["esm_type"].isin(esm_type_numeric)].assign( + esm_user_answer_numeric=lambda x: x.esm_user_answer.str.slice(stop=1).astype( + int + ) + ) + return df_esm_clean diff --git a/src/features/phone_esm/straw/preprocess.py b/src/features/phone_esm/straw/preprocess.py index cd99d906..a46d04ec 100644 --- a/src/features/phone_esm/straw/preprocess.py +++ b/src/features/phone_esm/straw/preprocess.py @@ -1,155 +1,4 @@ -import json - -import numpy as np -import pandas as pd - -ESM_TYPE = { - "text": 1, - "radio": 2, - "checkbox": 3, - "likert": 4, - "quick_answers": 5, - "scale": 6, - "datetime": 7, - "pam": 8, - "number": 9, - "web": 10, - "date": 11, -} - -QUESTIONNAIRE_IDS = { - "sleep_quality": 1, - "PANAS": { - "positive_affect": 8, - "negative_affect": 9 - }, - "job_content_questionnaire": { - "job_demand": 10, - "job_control": 11, - "supervisor_support": 12, - "coworker_support": 13, - }, - "PFITS": { - "supervisor": 14, - "coworkers": 15 - }, - "UWES": { - "vigor": 16, - "dedication": 17, - "absorption": 18 - }, - "COPE": { - "active": 19, - "support": 20, - "emotions": 21 - }, - "work_life_balance": { - "life_work": 22, - "work_life": 23 - }, - "recovery_experience": { - "detachment": 24, - "relaxation": 25 - }, - "symptoms": 26, - "stress_appraisal": { - "stressfulness_event": 87, - "threat": 88, - "challenge": 89, - "event_time": 90, - "event_duration": 91, - "event_work_related": 92, - "stressfulness_period": 93, - }, - "late_work": 94, - "work_hours": 95, - "left_work": 96, - "activities": 97, - "coffee_breaks": 98, - "at_work_yet": 99, -} - -ESM_STATUS_ANSWERED = 2 - -GROUP_SESSIONS_BY = ["participant_id", "device_id", "esm_session"] - -SESSION_STATUS_UNANSWERED = "ema_unanswered" -SESSION_STATUS_DAY_FINISHED = "day_finished" -SESSION_STATUS_COMPLETE = "ema_completed" - -ANSWER_DAY_FINISHED = "DayFinished3421" -ANSWER_DAY_OFF = "DayOff3421" -ANSWER_SET_EVENING = "DayFinishedSetEvening" - -MAX_MORNING_LENGTH = 3 -# When the participants was not yet at work at the time of the first (morning) EMA, -# only three items were answered. -# Two sleep related items and one indicating NOT starting work yet. -# Daytime EMAs are all longer, in fact they always consist of at least 6 items. - - -def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame: - """ - Convert timestamps into human-readable datetimes and dates - and expand the JSON column into several Pandas DF columns. - - Parameters - ---------- - df_esm: pd.DataFrame - A dataframe of esm data. - - Returns - ------- - df_esm_preprocessed: pd.DataFrame - A dataframe with added columns: datetime in Ljubljana timezone and all fields from ESM_JSON column. - """ - df_esm_json = df_esm["esm_json"].apply(json.loads) - df_esm_json = pd.json_normalize(df_esm_json).drop( - columns=["esm_trigger"] - ) # The esm_trigger column is already present in the main df. - return df_esm.join(df_esm_json) - - -def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame: - """ - This function eliminates invalid ESM responses. - It removes unanswered ESMs and those that indicate end of work and similar. - It also extracts a numeric answer from strings such as "4 - I strongly agree". - - Parameters - ---------- - df_esm_preprocessed: pd.DataFrame - A preprocessed dataframe of esm data. - - Returns - ------- - df_esm_clean: pd.DataFrame - A subset of the original dataframe. - - """ - df_esm_clean = df_esm_preprocessed[ - df_esm_preprocessed["esm_status"] == ESM_STATUS_ANSWERED - ] - df_esm_clean = df_esm_clean[ - ~df_esm_clean["esm_user_answer"].isin( - [ANSWER_DAY_FINISHED, ANSWER_DAY_OFF, ANSWER_SET_EVENING] - ) - ] - df_esm_clean["esm_user_answer_numeric"] = np.nan - esm_type_numeric = [ - ESM_TYPE.get("radio"), - ESM_TYPE.get("scale"), - ESM_TYPE.get("number"), - ] - df_esm_clean.loc[ - df_esm_clean["esm_type"].isin(esm_type_numeric) - ] = df_esm_clean.loc[df_esm_clean["esm_type"].isin(esm_type_numeric)].assign( - esm_user_answer_numeric=lambda x: x.esm_user_answer.str.slice(stop=1).astype( - int - ) - ) - return df_esm_clean - +from esm_preprocess import * df_esm = pd.read_csv(snakemake.input[0]) df_esm_preprocessed = preprocess_esm(df_esm) From cb116100dd50b7a86b5aabfd7b25cc14dd3fc06d Mon Sep 17 00:00:00 2001 From: junos Date: Wed, 16 Mar 2022 17:06:42 +0100 Subject: [PATCH 5/5] Move preprocessing to features. --- rules/features.smk | 8 ++++++++ rules/preprocessing.smk | 8 -------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/rules/features.smk b/rules/features.smk index f0fea945..defe843a 100644 --- a/rules/features.smk +++ b/rules/features.smk @@ -324,6 +324,14 @@ rule conversation_r_features: script: "../src/features/entry.R" +rule preprocess_esm: + input: "data/raw/{pid}/phone_esm_with_datetime.csv" + params: + questionnaire_ids = [8,9] + output: "data/interim/{pid}/phone_esm_clean.csv" + script: + "../src/features/phone_esm/straw/preprocess.py" + rule phone_keyboard_python_features: input: sensor_data = "data/raw/{pid}/phone_keyboard_with_datetime.csv", diff --git a/rules/preprocessing.smk b/rules/preprocessing.smk index 154556f3..83608204 100644 --- a/rules/preprocessing.smk +++ b/rules/preprocessing.smk @@ -190,14 +190,6 @@ rule phone_application_categories: script: "../src/data/application_categories.R" -rule preprocess_esm: - input: "data/raw/{pid}/phone_esm_with_datetime.csv" - params: - questionnaire_ids = [8,9] - output: "data/interim/{pid}/phone_esm_clean.csv" - script: - "../src/features/phone_esm/straw/preprocess.py" - rule pull_wearable_data: input: unpack(pull_wearable_data_input_with_mutation_scripts) params: