Modify the stress_event logic so that it also includes cases where stressfulness is 0.

sociality-task
Primoz 2022-12-09 16:01:46 +00:00
parent 87e5209a9f
commit 3b2001f570
2 changed files with 32 additions and 22 deletions

View File

@ -3,7 +3,7 @@
######################################################################################################################## ########################################################################################################################
# See https://www.rapids.science/latest/setup/configuration/#participant-files # See https://www.rapids.science/latest/setup/configuration/#participant-files
PIDS: ['p031', 'p032', 'p033', 'p034', 'p035', 'p036', 'p037', 'p038', 'p039', 'p040', 'p042', 'p043', 'p044', 'p045', 'p046', 'p049', 'p050', 'p052', 'p053', 'p054', 'p055', 'p057', 'p058', 'p059', 'p060', 'p061', 'p062', 'p064', 'p067', 'p068', 'p069', 'p070', 'p071', 'p072', 'p073', 'p074', 'p075', 'p076', 'p077', 'p078', 'p079', 'p080', 'p081', 'p082', 'p083', 'p084', 'p085', 'p086', 'p088', 'p089', 'p090', 'p091', 'p092', 'p093', 'p106', 'p107'] PIDS: ['p03'] #['p031', 'p032', 'p033', 'p034', 'p035', 'p036', 'p037', 'p038', 'p039', 'p040', 'p042', 'p043', 'p044', 'p045', 'p046', 'p049', 'p050', 'p052', 'p053', 'p054', 'p055', 'p057', 'p058', 'p059', 'p060', 'p061', 'p062', 'p064', 'p067', 'p068', 'p069', 'p070', 'p071', 'p072', 'p073', 'p074', 'p075', 'p076', 'p077', 'p078', 'p079', 'p080', 'p081', 'p082', 'p083', 'p084', 'p085', 'p086', 'p088', 'p089', 'p090', 'p091', 'p092', 'p093', 'p106', 'p107']
# See https://www.rapids.science/latest/setup/configuration/#automatic-creation-of-participant-files # See https://www.rapids.science/latest/setup/configuration/#automatic-creation-of-participant-files
CREATE_PARTICIPANT_FILES: CREATE_PARTICIPANT_FILES:
@ -26,7 +26,7 @@ TIME_SEGMENTS: &time_segments
INCLUDE_PAST_PERIODIC_SEGMENTS: TRUE # Only relevant if TYPE=PERIODIC, see docs INCLUDE_PAST_PERIODIC_SEGMENTS: TRUE # Only relevant if TYPE=PERIODIC, see docs
TAILORED_EVENTS: # Only relevant if TYPE=EVENT TAILORED_EVENTS: # Only relevant if TYPE=EVENT
COMPUTE: True COMPUTE: True
SEGMENTING_METHOD: "30_before" # 30_before, 90_before, stress_event SEGMENTING_METHOD: "stress_event" # 30_before, 90_before, stress_event
# See https://www.rapids.science/latest/setup/configuration/#timezone-of-your-study # See https://www.rapids.science/latest/setup/configuration/#timezone-of-your-study
TIMEZONE: TIMEZONE:
@ -733,7 +733,6 @@ PARAMS_FOR_ANALYSIS:
TARGET: TARGET:
COMPUTE: True COMPUTE: True
LABEL: appraisal_stressfulness_event_mean LABEL: appraisal_stressfulness_event_mean
ALL_LABELS: [PANAS_positive_affect_mean, PANAS_negative_affect_mean, JCQ_job_demand_mean, JCQ_job_control_mean, JCQ_supervisor_support_mean, ALL_LABELS: [appraisal_stressfulness_event_mean, appraisal_threat_mean, appraisal_challenge_mean]
JCQ_coworker_support_mean, appraisal_stressfulness_period_mean, appraisal_stressfulness_event_mean, appraisal_threat_mean, appraisal_challenge_mean]
# PANAS_positive_affect_mean, PANAS_negative_affect_mean, JCQ_job_demand_mean, JCQ_job_control_mean, JCQ_supervisor_support_mean, # PANAS_positive_affect_mean, PANAS_negative_affect_mean, JCQ_job_demand_mean, JCQ_job_control_mean, JCQ_supervisor_support_mean,
# JCQ_coworker_support_mean, appraisal_stressfulness_period_mean, appraisal_stressfulness_event_mean, appraisal_threat_mean, appraisal_challenge_mean # JCQ_coworker_support_mean, appraisal_stressfulness_period_mean, appraisal_stressfulness_event_mean, appraisal_threat_mean, appraisal_challenge_mean

View File

@ -49,13 +49,13 @@ def extract_ers(esm_df):
extracted_ers (DataFrame): dataframe with all necessary information to write event-related segments file extracted_ers (DataFrame): dataframe with all necessary information to write event-related segments file
in the correct format. in the correct format.
""" """
pd.set_option("display.max_rows", 20) pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", None) pd.set_option("display.max_columns", None)
with open('config.yaml', 'r') as stream: with open('config.yaml', 'r') as stream:
config = yaml.load(stream, Loader=yaml.FullLoader) config = yaml.load(stream, Loader=yaml.FullLoader)
pd.DataFrame(columns=["label", "intensity"]).to_csv(snakemake.output[1]) # Create an empty stress_events_targets file pd.DataFrame(columns=["label"]).to_csv(snakemake.output[1]) # Create an empty stress_events_targets file
esm_preprocessed = clean_up_esm(preprocess_esm(esm_df)) esm_preprocessed = clean_up_esm(preprocess_esm(esm_df))
@ -114,14 +114,20 @@ def extract_ers(esm_df):
possibility of the participant not remembering the start time precisely => this parameter can be manipulated with the variable possibility of the participant not remembering the start time precisely => this parameter can be manipulated with the variable
"time_before_event" which is defined below. "time_before_event" which is defined below.
If the participant marked that no stressful event happened, the default of 30 minutes before the event is chosen.
In this case, se_threat and se_challenge are NaN.
By default, this method also excludes all events that are longer than 2.5 hours so that the segments are easily comparable. By default, this method also excludes all events that are longer than 2.5 hours so that the segments are easily comparable.
""" """
# Get and join required data # Get and join required data
extracted_ers = esm_df.groupby(["device_id", "esm_session"])['timestamp'].apply(lambda x: math.ceil((x.max() - x.min()) / 1000)).reset_index().rename(columns={'timestamp': 'session_length'}) # questionnaire end timestamp extracted_ers = esm_df.groupby(["device_id", "esm_session"])['timestamp'].apply(lambda x: math.ceil((x.max() - x.min()) / 1000)).reset_index().rename(columns={'timestamp': 'session_length'}) # questionnaire length
extracted_ers = extracted_ers[extracted_ers["session_length"] <= 15 * 60].reset_index(drop=True) # ensure that the longest duration of the questionnaire anwsering is 15 min extracted_ers = extracted_ers[extracted_ers["session_length"] <= 15 * 60].reset_index(drop=True) # ensure that the longest duration of the questionnaire answering is 15 min
session_end_timestamp = esm_df.groupby(['device_id', 'esm_session'])['timestamp'].max().to_frame().rename(columns={'timestamp': 'session_end_timestamp'}) # questionnaire end timestamp session_end_timestamp = esm_df.groupby(['device_id', 'esm_session'])['timestamp'].max().to_frame().rename(columns={'timestamp': 'session_end_timestamp'}) # questionnaire end timestamp
se_time = esm_df[esm_df.questionnaire_id == 90.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_time'}) se_time = esm_df[esm_df.questionnaire_id == 90.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_time'})
se_duration = esm_df[esm_df.questionnaire_id == 91.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_duration'}) se_duration = esm_df[esm_df.questionnaire_id == 91.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_duration'})
# Extracted 3 targets that will be transferred with the csv file to the cleaning script. # Extracted 3 targets that will be transferred with the csv file to the cleaning script.
se_stressfulness_event_tg = esm_df[esm_df.questionnaire_id == 87.].set_index(['device_id', 'esm_session'])['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_stressfulness_event'}) se_stressfulness_event_tg = esm_df[esm_df.questionnaire_id == 87.].set_index(['device_id', 'esm_session'])['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_stressfulness_event'})
@ -130,35 +136,40 @@ def extract_ers(esm_df):
# All relevant features are joined by inner join to remove standalone columns (e.g., stressfulness event target has larger count) # All relevant features are joined by inner join to remove standalone columns (e.g., stressfulness event target has larger count)
extracted_ers = extracted_ers.join(session_end_timestamp, on=['device_id', 'esm_session'], how='inner') \ extracted_ers = extracted_ers.join(session_end_timestamp, on=['device_id', 'esm_session'], how='inner') \
.join(se_time, on=['device_id', 'esm_session'], how='inner') \
.join(se_duration, on=['device_id', 'esm_session'], how='inner') \
.join(se_stressfulness_event_tg, on=['device_id', 'esm_session'], how='inner') \ .join(se_stressfulness_event_tg, on=['device_id', 'esm_session'], how='inner') \
.join(se_threat_tg, on=['device_id', 'esm_session'], how='inner') \ .join(se_time, on=['device_id', 'esm_session'], how='left') \
.join(se_challenge_tg, on=['device_id', 'esm_session'], how='inner') .join(se_duration, on=['device_id', 'esm_session'], how='left') \
.join(se_threat_tg, on=['device_id', 'esm_session'], how='left') \
.join(se_challenge_tg, on=['device_id', 'esm_session'], how='left')
# Filter-out the sessions that are not useful. Because of the ambiguity this excludes:
# Filter sessions that are not useful. Because of the ambiguity this excludes:
# (1) straw event times that are marked as "0 - I don't remember" # (1) straw event times that are marked as "0 - I don't remember"
# (2) straw event durations that are marked as "0 - I don't remember" # (2) straw event durations that are marked as "0 - I don't remember"
extracted_ers = extracted_ers[(~extracted_ers.se_time.str.startswith("0 - ")) & (~extracted_ers.se_duration.str.startswith("0 - "))] extracted_ers = extracted_ers[(~extracted_ers.se_time.astype(str).str.startswith("0 - ")) & (~extracted_ers.se_duration.astype(str).str.startswith("0 - "))]
# Transform data into its final form, ready for the extraction
extracted_ers.reset_index(drop=True, inplace=True) extracted_ers.reset_index(drop=True, inplace=True)
time_before_event = 5 * 60 # in seconds (5 minutes) # Add a default duration in case the participant answered that no stressful event occurred
extracted_ers['event_timestamp'] = pd.to_datetime(extracted_ers['se_time']).apply(lambda x: x.timestamp() * 1000).astype('int64') def_time_before_questionnaire = 25 * 60 # in seconds (25 minutes; 5 more minutes are added later); multiplied by 1000 below to express it in milliseconds
extracted_ers['shift_direction'] = -1 extracted_ers["se_duration"] = extracted_ers["se_duration"].fillna((extracted_ers["session_length"] + def_time_before_questionnaire).astype(int) * 1000)
# Prepare data to fit the data structure in the CSV file ...
# Add the event time as the end of the questionnaire if no stress event occurred
extracted_ers['se_time'] = extracted_ers['se_time'].fillna(extracted_ers['session_end_timestamp'])
# The type can be an int (timestamp [ms]), which stays the same, or a datetime str, which is converted to a timestamp in milliseconds
extracted_ers['event_timestamp'] = extracted_ers['se_time'].apply(lambda x: x if isinstance(x, int) else pd.to_datetime(x).timestamp() * 1000).astype('int64')
extracted_ers['shift_direction'] = -1
# Checks whether the duration is marked with "1 - It's still ongoing" which means that the end of the current questionnaire # Checks whether the duration is marked with "1 - It's still ongoing" which means that the end of the current questionnaire
# is taken as end time of the segment. Else the user input duration is taken. # is taken as end time of the segment. Else the user input duration is taken.
extracted_ers['se_duration'] = \ extracted_ers['se_duration'] = \
np.where( np.where(
extracted_ers['se_duration'].str.startswith("1 - "), extracted_ers['se_duration'].astype(str).str.startswith("1 - "),
extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'], extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'],
extracted_ers['se_duration'] extracted_ers['se_duration']
) )
# This converts the rows of timestamps in milliseconds and the row with datetime to timestamp in seconds. # This converts the rows of timestamps in milliseconds and the rows with datetime... to timestamp in seconds.
time_before_event = 5 * 60 # in seconds (5 minutes)
extracted_ers['se_duration'] = \ extracted_ers['se_duration'] = \
extracted_ers['se_duration'].apply(lambda x: math.ceil(x / 1000) if isinstance(x, int) else (pd.to_datetime(x).hour * 60 + pd.to_datetime(x).minute) * 60) + time_before_event extracted_ers['se_duration'].apply(lambda x: math.ceil(x / 1000) if isinstance(x, int) else (pd.to_datetime(x).hour * 60 + pd.to_datetime(x).minute) * 60) + time_before_event