diff --git a/config.yaml b/config.yaml
index 61344d29..d52804fe 100644
--- a/config.yaml
+++ b/config.yaml
@@ -27,6 +27,8 @@ TIME_SEGMENTS: &time_segments
   TAILORED_EVENTS: # Only relevant if TYPE=EVENT
     COMPUTE: True
     SEGMENTING_METHOD: "stress_event" # 30_before, 90_before, stress_event
+    INTERVAL_OF_INTEREST: 5 # duration of event of interest [minutes]
+    INTERVAL_OF_INTEREST_PADDING: 7.5 # interval of interest padding (before and after IOI) [minutes]

 # See https://www.rapids.science/latest/setup/configuration/#timezone-of-your-study
 TIMEZONE:
diff --git a/src/features/all_cleaning_overall/straw/main.py b/src/features/all_cleaning_overall/straw/main.py
index 197c285d..22e2dd8e 100644
--- a/src/features/all_cleaning_overall/straw/main.py
+++ b/src/features/all_cleaning_overall/straw/main.py
@@ -23,6 +23,9 @@ def straw_cleaning(sensor_data_files, provider, target):

     graph_bf_af(features, "1target_rows_before")

+    # TODO: need to check whether all of the participants contain E4 columns
+    sys.exit()
+
     # (1.0) OVERRIDE STRESSFULNESS EVENT TARGETS IF ERS SEGMENTING_METHOD IS "STRESS_EVENT"
     if config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["SEGMENTING_METHOD"] == "stress_event":
diff --git a/src/features/phone_esm/straw/process_user_event_related_segments.py b/src/features/phone_esm/straw/process_user_event_related_segments.py
index eb6ba898..cd091b2c 100644
--- a/src/features/phone_esm/straw/process_user_event_related_segments.py
+++ b/src/features/phone_esm/straw/process_user_event_related_segments.py
@@ -49,7 +49,8 @@ def extract_ers(esm_df):
         extracted_ers (DataFrame): dataframe with all necessary information to write event-related segments file
         in the correct format.
     """
-    pd.set_option("display.max_rows", 50)
+
+    pd.set_option("display.max_rows", 100)
     pd.set_option("display.max_columns", None)

     with open('config.yaml', 'r') as stream:
@@ -105,7 +106,9 @@ def extract_ers(esm_df):
         extracted_ers["shift"] = extracted_ers["diffs"].apply(lambda x: format_timestamp(x))

     elif segmenting_method == "stress_event":
-        """This is a special case of the method as it consists of two important parts:
+        """
+        TODO: update documentation for this condition
+        This is a special case of the method as it consists of two important parts:
            (1) Generating of the ERS file (same as the methods above) and
            (2) Generating targets file alongside with the correct time segment labels.

@@ -120,22 +123,29 @@ def extract_ers(esm_df):
         By default, this method also excludes all events that are longer then 2.5 hours so that the segments are easily comparable.
         """

+        ioi = config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["INTERVAL_OF_INTEREST"] * 60 # interval of interest in seconds
+        ioi_padding = config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["INTERVAL_OF_INTEREST_PADDING"] * 60 # interval of interest padding in seconds
+
         # Get and join required data
         extracted_ers = esm_df.groupby(["device_id", "esm_session"])['timestamp'].apply(lambda x: math.ceil((x.max() - x.min()) / 1000)).reset_index().rename(columns={'timestamp': 'session_length'}) # questionnaire length
         extracted_ers = extracted_ers[extracted_ers["session_length"] <= 15 * 60].reset_index(drop=True) # ensure that the longest duration of the questionnaire answering is 15 min

+        session_start_timestamp = esm_df.groupby(['device_id', 'esm_session'])['timestamp'].min().to_frame().rename(columns={'timestamp': 'session_start_timestamp'}) # questionnaire start timestamp
         session_end_timestamp = esm_df.groupby(['device_id', 'esm_session'])['timestamp'].max().to_frame().rename(columns={'timestamp': 'session_end_timestamp'}) # questionnaire end timestamp
+
         # Users' answers for the stressfulness event (se) start times and durations
         se_time = esm_df[esm_df.questionnaire_id == 90.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_time'})
         se_duration = esm_df[esm_df.questionnaire_id == 91.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_duration'})
-
-        # Extracted 3 targets that will be transfered with the csv file to the cleaning script.
+        # Make se_durations to the appropriate lengths
+
+        # Extracted 3 targets that will be transferred in the csv file to the cleaning script.
         se_stressfulness_event_tg = esm_df[esm_df.questionnaire_id == 87.].set_index(['device_id', 'esm_session'])['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_stressfulness_event'})
         se_threat_tg = esm_df[esm_df.questionnaire_id == 88.].groupby(["device_id", "esm_session"]).mean()['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_threat'})
         se_challenge_tg = esm_df[esm_df.questionnaire_id == 89.].groupby(["device_id", "esm_session"]).mean()['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_challenge'})

         # All relevant features are joined by inner join to remove standalone columns (e.g., stressfulness event target has larger count)
-        extracted_ers = extracted_ers.join(session_end_timestamp, on=['device_id', 'esm_session'], how='inner') \
+        extracted_ers = extracted_ers.join(session_start_timestamp, on=['device_id', 'esm_session'], how='inner') \
+            .join(session_end_timestamp, on=['device_id', 'esm_session'], how='inner') \
             .join(se_stressfulness_event_tg, on=['device_id', 'esm_session'], how='inner') \
             .join(se_time, on=['device_id', 'esm_session'], how='left') \
             .join(se_duration, on=['device_id', 'esm_session'], how='left') \
@@ -149,16 +159,17 @@ def extract_ers(esm_df):
         extracted_ers.reset_index(drop=True, inplace=True)

         # Add default duration in case if participant answered that no stressful event occured
-        def_time_before_questionnaire = 25 * 60 # in seconds (25 minutes.. 5 minutes will be added later) - * 1000 to standardize it in miliseconds
-        extracted_ers["se_duration"] = extracted_ers["se_duration"].fillna((extracted_ers["session_length"] + def_time_before_questionnaire).astype(int) * 1000)
+        extracted_ers["se_duration"] = extracted_ers["se_duration"].fillna(int((ioi + 2*ioi_padding) * 1000))
         # Prepare data to fit the data structure in the CSV file ...
         # Add the event time as the end of the questionnaire if no stress event occured
-        extracted_ers['se_time'] = extracted_ers['se_time'].fillna(extracted_ers['session_end_timestamp'])
+        extracted_ers['se_time'] = extracted_ers['se_time'].fillna(extracted_ers['session_start_timestamp'])

         # Type could be an int (timestamp [ms]) which stays the same, and datetime str which is converted to timestamp in miliseconds
         extracted_ers['event_timestamp'] = extracted_ers['se_time'].apply(lambda x: x if isinstance(x, int) else pd.to_datetime(x).timestamp() * 1000).astype('int64')
         extracted_ers['shift_direction'] = -1
-
+
+        """>>>>> begin section (could be optimized) <<<<<"""
+
         # Checks whether the duration is marked with "1 - It's still ongoing" which means that the end of the current questionnaire
         # is taken as end time of the segment. Else the user input duration is taken.
         extracted_ers['se_duration'] = \
@@ -169,14 +180,29 @@ def extract_ers(esm_df):
             )

         # This converts the rows of timestamps in miliseconds and the rows with datetime... to timestamp in seconds.
-        time_before_event = 5 * 60 # in seconds (5 minutes)
         extracted_ers['se_duration'] = \
-            extracted_ers['se_duration'].apply(lambda x: math.ceil(x / 1000) if isinstance(x, int) else (pd.to_datetime(x).hour * 60 + pd.to_datetime(x).minute) * 60) + time_before_event
+            extracted_ers['se_duration'].apply(lambda x: math.ceil(x / 1000) if isinstance(x, int) else (pd.to_datetime(x).hour * 60 + pd.to_datetime(x).minute) * 60)

-        extracted_ers['shift'] = format_timestamp(time_before_event)
-        extracted_ers['length'] = extracted_ers['se_duration'].apply(lambda x: format_timestamp(x))
+        # Check whether min se_duration is at least the same duration as the ioi. Filter out the rest.
+        extracted_ers = extracted_ers[extracted_ers["se_duration"] >= ioi].reset_index(drop=True)

-        # Drop event_timestamp duplicates in case of user referencing the same event over multiple questionnaires
+        """>>>>> end section <<<<<"""
+
+        # Simply override all remaining durations to be of equal length
+        extracted_ers['se_duration'] = ioi + 2*ioi_padding
+
+        # Shift by the full segment length if no stressful event was reported, otherwise by the padding before the event
+        extracted_ers['shift'] = \
+            np.where(
+                extracted_ers['appraisal_stressfulness_event'] == 0,
+                extracted_ers['se_duration'],
+                ioi_padding
+            )
+
+        extracted_ers['shift'] = extracted_ers['shift'].apply(lambda x: format_timestamp(int(x)))
+        extracted_ers['length'] = extracted_ers['se_duration'].apply(lambda x: format_timestamp(int(x)))
+
+        # Drop event_timestamp duplicates in case the user is referencing the same event over multiple questionnaires
         extracted_ers.drop_duplicates(subset=["event_timestamp"], keep='first', inplace=True)
         extracted_ers.reset_index(drop=True, inplace=True)

@@ -188,7 +214,7 @@ def extract_ers(esm_df):
     else:
         raise Exception("Please select correct target method for the event-related segments.")
         extracted_ers = pd.DataFrame(columns=["label", "event_timestamp", "length", "shift", "shift_direction", "device_id"])
-
+
     return extracted_ers[["label", "event_timestamp", "length", "shift", "shift_direction", "device_id"]]
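Note on the new segment arithmetic (a minimal, hypothetical sketch, not part of the patch): with the values added to config.yaml (INTERVAL_OF_INTEREST = 5 minutes, INTERVAL_OF_INTEREST_PADDING = 7.5 minutes), every retained stress-event segment is normalized to the same length, and the backward shift depends on whether a stressful event was reported. The helper names shift_for and format_hhmmss below are illustrative only, and the HH:MM:SS rendering is an assumption about what the repository's format_timestamp() produces.

    # Hypothetical, self-contained sketch of the segment arithmetic introduced above.
    ioi = 5 * 60                  # INTERVAL_OF_INTEREST: 5 minutes -> 300 s
    ioi_padding = int(7.5 * 60)   # INTERVAL_OF_INTEREST_PADDING: 7.5 minutes -> 450 s

    # Every retained segment gets the same length: the interval of interest plus padding on both sides.
    se_duration = ioi + 2 * ioi_padding   # 300 + 900 = 1200 s (20 minutes)

    def shift_for(appraisal_stressfulness_event: int) -> int:
        # Mirrors the np.where() in the diff: with no reported stressful event (answer 0) the segment
        # is anchored one full segment length before the questionnaire start; otherwise only the
        # padding precedes the reported event time.
        return se_duration if appraisal_stressfulness_event == 0 else ioi_padding

    def format_hhmmss(seconds: int) -> str:
        # Assumed rendering of the existing format_timestamp() helper.
        return f"{seconds // 3600:02d}:{(seconds % 3600) // 60:02d}:{seconds % 60:02d}"

    print(format_hhmmss(se_duration))      # 00:20:00 -> 'length' column
    print(format_hhmmss(shift_for(0)))     # 00:20:00 -> 'shift' when no stressful event occurred
    print(format_hhmmss(shift_for(3)))     # 00:07:30 -> 'shift' when a stressful event was reported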