Separate function definitions from main.
parent
ef57103bac
commit
19b9da0ba3
|
@ -0,0 +1,151 @@
|
||||||
|
import json
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# Numeric codes of the ESM question types; clean_up_esm() compares them
# against the "esm_type" column.
ESM_TYPE = {
    "text": 1,
    "radio": 2,
    "checkbox": 3,
    "likert": 4,
    "quick_answers": 5,
    "scale": 6,
    "datetime": 7,
    "pam": 8,
    "number": 9,
    "web": 10,
    "date": 11,
}

# Numeric IDs of questionnaires; nested dicts group the sub-scales of one
# questionnaire.
# NOTE(review): presumably these match a questionnaire ID field in the ESM
# data -- confirm against the code that consumes this mapping.
QUESTIONNAIRE_IDS = {
    "sleep_quality": 1,
    "PANAS": {
        "positive_affect": 8,
        "negative_affect": 9
    },
    "job_content_questionnaire": {
        "job_demand": 10,
        "job_control": 11,
        "supervisor_support": 12,
        "coworker_support": 13,
    },
    "PFITS": {
        "supervisor": 14,
        "coworkers": 15
    },
    "UWES": {
        "vigor": 16,
        "dedication": 17,
        "absorption": 18
    },
    "COPE": {
        "active": 19,
        "support": 20,
        "emotions": 21
    },
    "work_life_balance": {
        "life_work": 22,
        "work_life": 23
    },
    "recovery_experience": {
        "detachment": 24,
        "relaxation": 25
    },
    "symptoms": 26,
    "stress_appraisal": {
        "stressfulness_event": 87,
        "threat": 88,
        "challenge": 89,
        "event_time": 90,
        "event_duration": 91,
        "event_work_related": 92,
        "stressfulness_period": 93,
    },
    "late_work": 94,
    "work_hours": 95,
    "left_work": 96,
    "activities": 97,
    "coffee_breaks": 98,
    "at_work_yet": 99,
}

# Value of the "esm_status" column that clean_up_esm() keeps (answered items).
ESM_STATUS_ANSWERED = 2

# NOTE(review): presumably the columns that together identify one ESM
# session when grouping -- confirm against the caller.
GROUP_SESSIONS_BY = ["participant_id", "device_id", "esm_session"]

# Labels for the status of a whole session (not used in this part of the file).
SESSION_STATUS_UNANSWERED = "ema_unanswered"
SESSION_STATUS_DAY_FINISHED = "day_finished"
SESSION_STATUS_COMPLETE = "ema_completed"

# Special values of "esm_user_answer" that clean_up_esm() removes because they
# are not real questionnaire answers.
ANSWER_DAY_FINISHED = "DayFinished3421"
ANSWER_DAY_OFF = "DayOff3421"
ANSWER_SET_EVENING = "DayFinishedSetEvening"

MAX_MORNING_LENGTH = 3
# When a participant was not yet at work at the time of the first (morning) EMA,
# only three items were answered:
# two sleep-related items and one indicating NOT starting work yet.
# Daytime EMAs are all longer; in fact, they always consist of at least 6 items.
|
||||||
|
|
||||||
|
|
||||||
|
def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
    """
    Expand the JSON column into several Pandas DF columns.

    Parameters
    ----------
    df_esm: pd.DataFrame
        A dataframe of esm data. It must contain an "esm_json" column
        holding one JSON document (as a string) per row.

    Returns
    -------
    df_esm_preprocessed: pd.DataFrame
        The original dataframe with all fields from the esm_json column
        added as columns, except esm_trigger, which is already present
        in the main df.
    """
    parsed_json = df_esm["esm_json"].apply(json.loads).tolist()
    df_esm_json = pd.json_normalize(parsed_json)
    # json_normalize numbers its rows 0..n-1. Restore the caller's index so
    # that the index-based join below pairs every row with its own JSON
    # fields even when df_esm has been filtered, sorted or re-indexed.
    df_esm_json.index = df_esm.index
    # The esm_trigger column is already present in the main df.
    # errors="ignore" keeps inputs without that field (e.g. an empty
    # dataframe) from failing.
    df_esm_json = df_esm_json.drop(columns=["esm_trigger"], errors="ignore")
    return df_esm.join(df_esm_json)
|
||||||
|
|
||||||
|
|
||||||
|
def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
    This function eliminates invalid ESM responses.
    It removes unanswered ESMs and those that indicate end of work and similar.
    It also extracts a numeric answer from strings such as "4 - I strongly agree".

    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
        A preprocessed dataframe of esm data. The columns esm_status,
        esm_user_answer and esm_type are read.

    Returns
    -------
    df_esm_clean: pd.DataFrame
        A subset of the original dataframe (as an independent copy) with an
        added float column esm_user_answer_numeric. It holds the leading
        integer of the answer for radio, scale and number items and NaN
        everywhere else (including answers that do not start with a number).
    """
    is_answered = df_esm_preprocessed["esm_status"] == ESM_STATUS_ANSWERED
    is_marker_answer = df_esm_preprocessed["esm_user_answer"].isin(
        [ANSWER_DAY_FINISHED, ANSWER_DAY_OFF, ANSWER_SET_EVENING]
    )
    # Copy explicitly: a column is added below, and writing into a filtered
    # slice of the caller's dataframe triggers chained-assignment problems.
    df_esm_clean = df_esm_preprocessed[is_answered & ~is_marker_answer].copy()

    esm_type_numeric = [
        ESM_TYPE["radio"],
        ESM_TYPE["scale"],
        ESM_TYPE["number"],
    ]
    is_numeric_type = df_esm_clean["esm_type"].isin(esm_type_numeric)

    # Take the whole leading integer rather than only the first character,
    # so that "10 - ..." yields 10 instead of 1. Answers without a leading
    # number (or missing answers) become NaN instead of raising.
    leading_number = (
        df_esm_clean.loc[is_numeric_type, "esm_user_answer"]
        .astype(str)
        .str.extract(r"^\s*(-?\d+)", expand=False)
    )

    df_esm_clean["esm_user_answer_numeric"] = np.nan
    # Assign positionally (to_numpy) so that duplicate index labels cannot
    # break the alignment; the selection order is the same on both sides.
    df_esm_clean.loc[is_numeric_type, "esm_user_answer_numeric"] = pd.to_numeric(
        leading_number, errors="coerce"
    ).to_numpy()
    return df_esm_clean
|
|
@ -1,155 +1,4 @@
|
||||||
import json
|
from esm_preprocess import *
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
# Numeric codes of the ESM question types; clean_up_esm() compares them
# against the "esm_type" column.
ESM_TYPE = {
    "text": 1,
    "radio": 2,
    "checkbox": 3,
    "likert": 4,
    "quick_answers": 5,
    "scale": 6,
    "datetime": 7,
    "pam": 8,
    "number": 9,
    "web": 10,
    "date": 11,
}

# Numeric IDs of questionnaires; nested dicts group the sub-scales of one
# questionnaire.
# NOTE(review): presumably these match a questionnaire ID field in the ESM
# data -- confirm against the code that consumes this mapping.
QUESTIONNAIRE_IDS = {
    "sleep_quality": 1,
    "PANAS": {
        "positive_affect": 8,
        "negative_affect": 9
    },
    "job_content_questionnaire": {
        "job_demand": 10,
        "job_control": 11,
        "supervisor_support": 12,
        "coworker_support": 13,
    },
    "PFITS": {
        "supervisor": 14,
        "coworkers": 15
    },
    "UWES": {
        "vigor": 16,
        "dedication": 17,
        "absorption": 18
    },
    "COPE": {
        "active": 19,
        "support": 20,
        "emotions": 21
    },
    "work_life_balance": {
        "life_work": 22,
        "work_life": 23
    },
    "recovery_experience": {
        "detachment": 24,
        "relaxation": 25
    },
    "symptoms": 26,
    "stress_appraisal": {
        "stressfulness_event": 87,
        "threat": 88,
        "challenge": 89,
        "event_time": 90,
        "event_duration": 91,
        "event_work_related": 92,
        "stressfulness_period": 93,
    },
    "late_work": 94,
    "work_hours": 95,
    "left_work": 96,
    "activities": 97,
    "coffee_breaks": 98,
    "at_work_yet": 99,
}

# Value of the "esm_status" column that clean_up_esm() keeps (answered items).
ESM_STATUS_ANSWERED = 2

# NOTE(review): presumably the columns that together identify one ESM
# session when grouping -- confirm against the caller.
GROUP_SESSIONS_BY = ["participant_id", "device_id", "esm_session"]

# Labels for the status of a whole session (not used in this part of the file).
SESSION_STATUS_UNANSWERED = "ema_unanswered"
SESSION_STATUS_DAY_FINISHED = "day_finished"
SESSION_STATUS_COMPLETE = "ema_completed"

# Special values of "esm_user_answer" that clean_up_esm() removes because they
# are not real questionnaire answers.
ANSWER_DAY_FINISHED = "DayFinished3421"
ANSWER_DAY_OFF = "DayOff3421"
ANSWER_SET_EVENING = "DayFinishedSetEvening"

MAX_MORNING_LENGTH = 3
# When a participant was not yet at work at the time of the first (morning) EMA,
# only three items were answered:
# two sleep-related items and one indicating NOT starting work yet.
# Daytime EMAs are all longer; in fact, they always consist of at least 6 items.
|
|
||||||
|
|
||||||
|
|
||||||
def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
    """
    Expand the JSON column into several Pandas DF columns.

    Parameters
    ----------
    df_esm: pd.DataFrame
        A dataframe of esm data. It must contain an "esm_json" column
        holding one JSON document (as a string) per row.

    Returns
    -------
    df_esm_preprocessed: pd.DataFrame
        The original dataframe with all fields from the esm_json column
        added as columns, except esm_trigger, which is already present
        in the main df.
    """
    parsed_json = df_esm["esm_json"].apply(json.loads).tolist()
    df_esm_json = pd.json_normalize(parsed_json)
    # json_normalize numbers its rows 0..n-1. Restore the caller's index so
    # that the index-based join below pairs every row with its own JSON
    # fields even when df_esm has been filtered, sorted or re-indexed.
    df_esm_json.index = df_esm.index
    # The esm_trigger column is already present in the main df.
    # errors="ignore" keeps inputs without that field (e.g. an empty
    # dataframe) from failing.
    df_esm_json = df_esm_json.drop(columns=["esm_trigger"], errors="ignore")
    return df_esm.join(df_esm_json)
|
|
||||||
|
|
||||||
|
|
||||||
def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
    This function eliminates invalid ESM responses.
    It removes unanswered ESMs and those that indicate end of work and similar.
    It also extracts a numeric answer from strings such as "4 - I strongly agree".

    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
        A preprocessed dataframe of esm data. The columns esm_status,
        esm_user_answer and esm_type are read.

    Returns
    -------
    df_esm_clean: pd.DataFrame
        A subset of the original dataframe (as an independent copy) with an
        added float column esm_user_answer_numeric. It holds the leading
        integer of the answer for radio, scale and number items and NaN
        everywhere else (including answers that do not start with a number).
    """
    is_answered = df_esm_preprocessed["esm_status"] == ESM_STATUS_ANSWERED
    is_marker_answer = df_esm_preprocessed["esm_user_answer"].isin(
        [ANSWER_DAY_FINISHED, ANSWER_DAY_OFF, ANSWER_SET_EVENING]
    )
    # Copy explicitly: a column is added below, and writing into a filtered
    # slice of the caller's dataframe triggers chained-assignment problems.
    df_esm_clean = df_esm_preprocessed[is_answered & ~is_marker_answer].copy()

    esm_type_numeric = [
        ESM_TYPE["radio"],
        ESM_TYPE["scale"],
        ESM_TYPE["number"],
    ]
    is_numeric_type = df_esm_clean["esm_type"].isin(esm_type_numeric)

    # Take the whole leading integer rather than only the first character,
    # so that "10 - ..." yields 10 instead of 1. Answers without a leading
    # number (or missing answers) become NaN instead of raising.
    leading_number = (
        df_esm_clean.loc[is_numeric_type, "esm_user_answer"]
        .astype(str)
        .str.extract(r"^\s*(-?\d+)", expand=False)
    )

    df_esm_clean["esm_user_answer_numeric"] = np.nan
    # Assign positionally (to_numpy) so that duplicate index labels cannot
    # break the alignment; the selection order is the same on both sides.
    df_esm_clean.loc[is_numeric_type, "esm_user_answer_numeric"] = pd.to_numeric(
        leading_number, errors="coerce"
    ).to_numpy()
    return df_esm_clean
|
|
||||||
|
|
||||||
|
|
||||||
df_esm = pd.read_csv(snakemake.input[0])
|
df_esm = pd.read_csv(snakemake.input[0])
|
||||||
df_esm_preprocessed = preprocess_esm(df_esm)
|
df_esm_preprocessed = preprocess_esm(df_esm)
|
||||||
|
|
Loading…
Reference in New Issue