Make stress events equal in duration.

sociality-task
Primoz 2022-12-14 14:52:20 +00:00
parent 3ce7f2c2a5
commit 7f5a4e6744
3 changed files with 46 additions and 15 deletions

View File

@@ -27,6 +27,8 @@ TIME_SEGMENTS: &time_segments
TAILORED_EVENTS: # Only relevant if TYPE=EVENT
COMPUTE: True
SEGMENTING_METHOD: "stress_event" # 30_before, 90_before, stress_event
INTERVAL_OF_INTEREST: 5 # duration of event of interest [minutes]
INTERVAL_OF_INTEREST_PADDING: 7.5 # interval of interest padding (before and after IOI) [minutes]
# See https://www.rapids.science/latest/setup/configuration/#timezone-of-your-study
TIMEZONE:
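
With these two keys, every stress-event segment gets one fixed length. A minimal sketch of the arithmetic, using the values above (the 20-minute total is implied by the extraction code further down, not stated in the config):

INTERVAL_OF_INTEREST = 5            # minutes of the event of interest
INTERVAL_OF_INTEREST_PADDING = 7.5  # minutes added before and after the IOI

segment_minutes = INTERVAL_OF_INTEREST + 2 * INTERVAL_OF_INTEREST_PADDING
print(segment_minutes)  # 20.0 -> every "stress_event" segment spans 20 minutes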

View File

@@ -23,6 +23,9 @@ def straw_cleaning(sensor_data_files, provider, target):
graph_bf_af(features, "1target_rows_before")
# TODO: need to check whether all of the participants contain E4 columns
sys.exit()
# (1.0) OVERRIDE STRESSFULNESS EVENT TARGETS IF ERS SEGMENTING_METHOD IS "STRESS_EVENT"
if config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["SEGMENTING_METHOD"] == "stress_event":

View File

@@ -49,7 +49,8 @@ def extract_ers(esm_df):
extracted_ers (DataFrame): dataframe with all necessary information to write event-related segments file
in the correct format.
"""
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", None) pd.set_option("display.max_columns", None)
with open('config.yaml', 'r') as stream: with open('config.yaml', 'r') as stream:
@@ -105,7 +106,9 @@ def extract_ers(esm_df):
extracted_ers["shift"] = extracted_ers["diffs"].apply(lambda x: format_timestamp(x)) extracted_ers["shift"] = extracted_ers["diffs"].apply(lambda x: format_timestamp(x))
elif segmenting_method == "stress_event": elif segmenting_method == "stress_event":
"""This is a special case of the method as it consists of two important parts: """
TODO: update documentation for this condition
This is a special case of the method as it consists of two important parts:
(1) Generating the ERS file (same as the methods above) and
(2) Generating the targets file along with the correct time segment labels.
@@ -120,22 +123,29 @@ def extract_ers(esm_df):
By default, this method also excludes all events that are longer than 2.5 hours so that the segments are easily comparable.
""" """
ioi = config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["INTERVAL_OF_INTEREST"] * 60 # interval of interest in seconds
ioi_padding = config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["INTERVAL_OF_INTEREST_PADDING"] * 60 # interval of interest padding in seconds
# Get and join required data
extracted_ers = esm_df.groupby(["device_id", "esm_session"])['timestamp'].apply(lambda x: math.ceil((x.max() - x.min()) / 1000)).reset_index().rename(columns={'timestamp': 'session_length'}) # questionnaire length
extracted_ers = extracted_ers[extracted_ers["session_length"] <= 15 * 60].reset_index(drop=True) # ensure that the longest duration of the questionnaire answering is 15 min
session_start_timestamp = esm_df.groupby(['device_id', 'esm_session'])['timestamp'].min().to_frame().rename(columns={'timestamp': 'session_start_timestamp'}) # questionnaire start timestamp
session_end_timestamp = esm_df.groupby(['device_id', 'esm_session'])['timestamp'].max().to_frame().rename(columns={'timestamp': 'session_end_timestamp'}) # questionnaire end timestamp
# Users' answers for the stressfulness event (se) start times and durations
se_time = esm_df[esm_df.questionnaire_id == 90.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_time'})
se_duration = esm_df[esm_df.questionnaire_id == 91.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_duration'})
# Make se_durations to the appropriate lengths
# Extracted 3 targets that will be transferred with the csv file to the cleaning script.
# Extracted 3 targets that will be transferred in the csv file to the cleaning script.
se_stressfulness_event_tg = esm_df[esm_df.questionnaire_id == 87.].set_index(['device_id', 'esm_session'])['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_stressfulness_event'})
se_threat_tg = esm_df[esm_df.questionnaire_id == 88.].groupby(["device_id", "esm_session"]).mean()['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_threat'})
se_challenge_tg = esm_df[esm_df.questionnaire_id == 89.].groupby(["device_id", "esm_session"]).mean()['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_challenge'})
# All relevant features are joined by inner join to remove standalone columns (e.g., stressfulness event target has larger count)
extracted_ers = extracted_ers.join(session_end_timestamp, on=['device_id', 'esm_session'], how='inner') \
extracted_ers = extracted_ers.join(session_start_timestamp, on=['device_id', 'esm_session'], how='inner') \
.join(session_end_timestamp, on=['device_id', 'esm_session'], how='inner') \
.join(se_stressfulness_event_tg, on=['device_id', 'esm_session'], how='inner') \
.join(se_time, on=['device_id', 'esm_session'], how='left') \
.join(se_duration, on=['device_id', 'esm_session'], how='left') \
@@ -149,16 +159,17 @@ def extract_ers(esm_df):
extracted_ers.reset_index(drop=True, inplace=True)
# Add default duration in case the participant answered that no stressful event occurred
def_time_before_questionnaire = 25 * 60 # in seconds (25 minutes; 5 minutes will be added later) - * 1000 to standardize it in milliseconds
extracted_ers["se_duration"] = extracted_ers["se_duration"].fillna(int((ioi + 2*ioi_padding) * 1000))
extracted_ers["se_duration"] = extracted_ers["se_duration"].fillna((extracted_ers["session_length"] + def_time_before_questionnaire).astype(int) * 1000)
# Prepare data to fit the data structure in the CSV file ...
# Add the event time as the start of the questionnaire if no stress event occurred
extracted_ers['se_time'] = extracted_ers['se_time'].fillna(extracted_ers['session_end_timestamp'])
extracted_ers['se_time'] = extracted_ers['se_time'].fillna(extracted_ers['session_start_timestamp'])
# Type could be an int (timestamp [ms]) which stays the same, and a datetime str which is converted to a timestamp in milliseconds
extracted_ers['event_timestamp'] = extracted_ers['se_time'].apply(lambda x: x if isinstance(x, int) else pd.to_datetime(x).timestamp() * 1000).astype('int64')
extracted_ers['shift_direction'] = -1
""">>>>> begin section (could be optimized) <<<<<"""
# Checks whether the duration is marked with "1 - It's still ongoing" which means that the end of the current questionnaire
# is taken as end time of the segment. Else the user input duration is taken.
extracted_ers['se_duration'] = \
@@ -169,14 +180,29 @@ def extract_ers(esm_df):
)
# This converts the rows of timestamps in milliseconds and the rows with datetime ... to timestamps in seconds.
time_before_event = 5 * 60 # in seconds (5 minutes)
extracted_ers['se_duration'] = \
extracted_ers['se_duration'].apply(lambda x: math.ceil(x / 1000) if isinstance(x, int) else (pd.to_datetime(x).hour * 60 + pd.to_datetime(x).minute) * 60) + time_before_event
extracted_ers['se_duration'].apply(lambda x: math.ceil(x / 1000) if isinstance(x, int) else (pd.to_datetime(x).hour * 60 + pd.to_datetime(x).minute) * 60)
extracted_ers['shift'] = format_timestamp(time_before_event)
# Check whether the min se_duration is at least as long as the ioi. Filter out the rest.
extracted_ers['length'] = extracted_ers['se_duration'].apply(lambda x: format_timestamp(x))
extracted_ers = extracted_ers[extracted_ers["se_duration"] >= ioi].reset_index(drop=True)
# Drop event_timestamp duplicates in case of user referencing the same event over multiple questionnaires
""">>>>> end section <<<<<"""
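
As an aside, the two branches of the duration conversion above can be exercised in isolation; a minimal sketch (the toy inputs are assumptions about the two answer formats, int epoch-ms and an HH:MM-style string):

import math
import pandas as pd

to_seconds = lambda x: math.ceil(x / 1000) if isinstance(x, int) \
    else (pd.to_datetime(x).hour * 60 + pd.to_datetime(x).minute) * 60

print(to_seconds(90000))    # int branch: 90000 ms -> 90 s
print(to_seconds("01:30"))  # datetime-str branch: 1 h 30 min -> 5400 s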
# Simply override all remaining durations to the same fixed length
extracted_ers['se_duration'] = ioi + 2*ioi_padding
#
extracted_ers['shift'] = \
np.where(
extracted_ers['appraisal_stressfulness_event'] == 0,
extracted_ers['se_duration'],
ioi_padding
)
extracted_ers['shift'] = extracted_ers['shift'].apply(lambda x: format_timestamp(int(x)))
extracted_ers['length'] = extracted_ers['se_duration'].apply(lambda x: format_timestamp(int(x)))
# Drop event_timestamp duplicates in case the user is referencing the same event over multiple questionnaires
extracted_ers.drop_duplicates(subset=["event_timestamp"], keep='first', inplace=True)
extracted_ers.reset_index(drop=True, inplace=True)
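
Putting the pieces together, a self-contained sketch of the window arithmetic that the shift/length columns encode (that segment start = event_timestamp - shift and end = start + length under shift_direction = -1 is an assumption about the downstream ERS parser):

IOI = 5 * 60                     # interval of interest, seconds
PADDING = 7.5 * 60               # padding before and after the IOI, seconds
SE_DURATION = IOI + 2 * PADDING  # 1200 s: every surviving row gets this length

def segment_bounds(event_ts_ms: int, had_stress_event: bool) -> tuple:
    # shift mirrors the np.where above: the full duration when no event was
    # reported (so the segment ends at questionnaire start), padding otherwise
    shift_s = PADDING if had_stress_event else SE_DURATION
    start_ms = event_ts_ms - int(shift_s * 1000)  # shift_direction = -1
    return start_ms, start_ms + int(SE_DURATION * 1000)

# Event reported at 14:00 -> window 13:52:30-14:12:30;
# "no event" with questionnaire start at 14:00 -> window 13:40:00-14:00:00.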