Make stress events equal in duration.

sociality-task
Primoz 2022-12-14 14:52:20 +00:00
parent 3ce7f2c2a5
commit 7f5a4e6744
3 changed files with 46 additions and 15 deletions

View File

@@ -27,6 +27,8 @@ TIME_SEGMENTS: &time_segments
  TAILORED_EVENTS: # Only relevant if TYPE=EVENT
    COMPUTE: True
    SEGMENTING_METHOD: "stress_event" # 30_before, 90_before, stress_event
+   INTERVAL_OF_INTEREST: 5 # duration of event of interest [minutes]
+   INTERVAL_OF_INTEREST_PADDING: 7.5 # interval of interest padding (before and after IOI) [minutes]

# See https://www.rapids.science/latest/setup/configuration/#timezone-of-your-study
TIMEZONE:
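For reference, a minimal sketch (not part of the commit) of how the two new keys combine into a single fixed segment length. The yaml import and safe_load call are assumptions about how the config is read; the values are the defaults added above, and the ioi/ioi_padding names mirror the extraction script changed below.

import yaml

with open("config.yaml", "r") as stream:
    config = yaml.safe_load(stream)

tailored = config["TIME_SEGMENTS"]["TAILORED_EVENTS"]
ioi = tailored["INTERVAL_OF_INTEREST"] * 60                   # 5 min -> 300 s
ioi_padding = tailored["INTERVAL_OF_INTEREST_PADDING"] * 60   # 7.5 min -> 450 s

# Every stress-event segment gets the same total duration: padding + IOI + padding.
segment_length = ioi + 2 * ioi_padding                        # 300 + 2*450 = 1200 s
print(segment_length / 60, "minutes")                         # -> 20.0 minutes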

View File

@@ -23,6 +23,9 @@ def straw_cleaning(sensor_data_files, provider, target):
    graph_bf_af(features, "1target_rows_before")

+   # TODO: need to check whether all of the participants contain E4 columns
+   sys.exit()

    # (1.0) OVERRIDE STRESSFULNESS EVENT TARGETS IF ERS SEGMENTING_METHOD IS "STRESS_EVENT"
    if config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["SEGMENTING_METHOD"] == "stress_event":
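The TODO and sys.exit() above are debugging scaffolding; the check itself is not implemented in this commit. A hypothetical sketch of what it could look like, assuming E4-derived feature columns share an "empatica_" prefix and the feature matrix has a "pid" column identifying participants (neither detail is confirmed by the diff):

import pandas as pd

def check_e4_columns(features: pd.DataFrame, prefix: str = "empatica_") -> list:
    """Return participants (pid) whose rows contain no values in any E4-derived column."""
    # Prefix is an assumption, not taken from this repository.
    e4_cols = [col for col in features.columns if col.startswith(prefix)]
    if not e4_cols:
        raise ValueError("No E4 (Empatica) feature columns found in the feature matrix.")
    # True for a participant whose E4 columns are entirely NaN.
    only_nan = features.groupby("pid")[e4_cols].apply(lambda part: part.isna().all().all())
    return only_nan[only_nan].index.tolist()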

View File

@@ -49,7 +49,8 @@ def extract_ers(esm_df):
        extracted_ers (DataFrame): dataframe with all necessary information to write event-related segments file
        in the correct format.
    """
-   pd.set_option("display.max_rows", 50)
+   pd.set_option("display.max_rows", 100)
    pd.set_option("display.max_columns", None)

    with open('config.yaml', 'r') as stream:
@@ -105,7 +106,9 @@ def extract_ers(esm_df):
        extracted_ers["shift"] = extracted_ers["diffs"].apply(lambda x: format_timestamp(x))

    elif segmenting_method == "stress_event":
-       """This is a special case of the method as it consists of two important parts:
+       """
+       TODO: update documentation for this condition
+       This is a special case of the method as it consists of two important parts:
            (1) Generating of the ERS file (same as the methods above) and
            (2) Generating targets file alongside with the correct time segment labels.
@@ -120,22 +123,29 @@ def extract_ers(esm_df):
        By default, this method also excludes all events that are longer then 2.5 hours so that the segments are easily comparable.
        """

+       ioi = config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["INTERVAL_OF_INTEREST"] * 60 # interval of interest in seconds
+       ioi_padding = config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["INTERVAL_OF_INTEREST_PADDING"] * 60 # interval of interest padding in seconds

        # Get and join required data
        extracted_ers = esm_df.groupby(["device_id", "esm_session"])['timestamp'].apply(lambda x: math.ceil((x.max() - x.min()) / 1000)).reset_index().rename(columns={'timestamp': 'session_length'}) # questionnaire length
        extracted_ers = extracted_ers[extracted_ers["session_length"] <= 15 * 60].reset_index(drop=True) # ensure that the longest duration of the questionnaire answering is 15 min
+       session_start_timestamp = esm_df.groupby(['device_id', 'esm_session'])['timestamp'].min().to_frame().rename(columns={'timestamp': 'session_start_timestamp'}) # questionnaire start timestamp
        session_end_timestamp = esm_df.groupby(['device_id', 'esm_session'])['timestamp'].max().to_frame().rename(columns={'timestamp': 'session_end_timestamp'}) # questionnaire end timestamp

+       # Users' answers for the stressfulness event (se) start times and durations
        se_time = esm_df[esm_df.questionnaire_id == 90.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_time'})
        se_duration = esm_df[esm_df.questionnaire_id == 91.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_duration'})

-       # Make se_durations to the appropriate lengths
-       # Extracted 3 targets that will be transfered with the csv file to the cleaning script.
+       # Extracted 3 targets that will be transferred in the csv file to the cleaning script.
        se_stressfulness_event_tg = esm_df[esm_df.questionnaire_id == 87.].set_index(['device_id', 'esm_session'])['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_stressfulness_event'})
        se_threat_tg = esm_df[esm_df.questionnaire_id == 88.].groupby(["device_id", "esm_session"]).mean()['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_threat'})
        se_challenge_tg = esm_df[esm_df.questionnaire_id == 89.].groupby(["device_id", "esm_session"]).mean()['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_challenge'})

        # All relevant features are joined by inner join to remove standalone columns (e.g., stressfulness event target has larger count)
-       extracted_ers = extracted_ers.join(session_end_timestamp, on=['device_id', 'esm_session'], how='inner') \
+       extracted_ers = extracted_ers.join(session_start_timestamp, on=['device_id', 'esm_session'], how='inner') \
+                           .join(session_end_timestamp, on=['device_id', 'esm_session'], how='inner') \
                            .join(se_stressfulness_event_tg, on=['device_id', 'esm_session'], how='inner') \
                            .join(se_time, on=['device_id', 'esm_session'], how='left') \
                            .join(se_duration, on=['device_id', 'esm_session'], how='left') \
@@ -149,16 +159,17 @@ def extract_ers(esm_df):
        extracted_ers.reset_index(drop=True, inplace=True)

        # Add default duration in case if participant answered that no stressful event occured
-       def_time_before_questionnaire = 25 * 60 # in seconds (25 minutes.. 5 minutes will be added later) - * 1000 to standardize it in miliseconds
-       extracted_ers["se_duration"] = extracted_ers["se_duration"].fillna((extracted_ers["session_length"] + def_time_before_questionnaire).astype(int) * 1000)
+       extracted_ers["se_duration"] = extracted_ers["se_duration"].fillna(int((ioi + 2*ioi_padding) * 1000))

        # Prepare data to fit the data structure in the CSV file ...
        # Add the event time as the end of the questionnaire if no stress event occured
-       extracted_ers['se_time'] = extracted_ers['se_time'].fillna(extracted_ers['session_end_timestamp'])
+       extracted_ers['se_time'] = extracted_ers['se_time'].fillna(extracted_ers['session_start_timestamp'])

        # Type could be an int (timestamp [ms]) which stays the same, and datetime str which is converted to timestamp in miliseconds
        extracted_ers['event_timestamp'] = extracted_ers['se_time'].apply(lambda x: x if isinstance(x, int) else pd.to_datetime(x).timestamp() * 1000).astype('int64')
        extracted_ers['shift_direction'] = -1

+       """>>>>> begin section (could be optimized) <<<<<"""

        # Checks whether the duration is marked with "1 - It's still ongoing" which means that the end of the current questionnaire
        # is taken as end time of the segment. Else the user input duration is taken.
        extracted_ers['se_duration'] = \
@@ -169,14 +180,29 @@ def extract_ers(esm_df):
            )

        # This converts the rows of timestamps in miliseconds and the rows with datetime... to timestamp in seconds.
-       time_before_event = 5 * 60 # in seconds (5 minutes)
        extracted_ers['se_duration'] = \
-           extracted_ers['se_duration'].apply(lambda x: math.ceil(x / 1000) if isinstance(x, int) else (pd.to_datetime(x).hour * 60 + pd.to_datetime(x).minute) * 60) + time_before_event
+           extracted_ers['se_duration'].apply(lambda x: math.ceil(x / 1000) if isinstance(x, int) else (pd.to_datetime(x).hour * 60 + pd.to_datetime(x).minute) * 60)

-       extracted_ers['shift'] = format_timestamp(time_before_event)
-       extracted_ers['length'] = extracted_ers['se_duration'].apply(lambda x: format_timestamp(x))
-       # Drop event_timestamp duplicates in case of user referencing the same event over multiple questionnaires
+       # Check whether min se_duration is at least the same duration as the ioi. Filter out the rest.
+       extracted_ers = extracted_ers[extracted_ers["se_duration"] >= ioi].reset_index(drop=True)
+       """>>>>> end section <<<<<"""
+
+       # Simply override all remaining durations to be of an equal amount
+       extracted_ers['se_duration'] = ioi + 2*ioi_padding
+       #
+       extracted_ers['shift'] = \
+           np.where(
+               extracted_ers['appraisal_stressfulness_event'] == 0,
+               extracted_ers['se_duration'],
+               ioi_padding
+           )
+       extracted_ers['shift'] = extracted_ers['shift'].apply(lambda x: format_timestamp(int(x)))
+       extracted_ers['length'] = extracted_ers['se_duration'].apply(lambda x: format_timestamp(int(x)))
+
+       # Drop event_timestamp duplicates in case the user is referencing the same event over multiple questionnaires
        extracted_ers.drop_duplicates(subset=["event_timestamp"], keep='first', inplace=True)
        extracted_ers.reset_index(drop=True, inplace=True)
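To make the segment arithmetic above concrete, a small worked example (not from the commit) using this commit's config defaults. The window interpretation assumes RAPIDS moves the segment start backwards by shift when shift_direction is -1.

ioi = 5 * 60                         # 300 s
ioi_padding = int(7.5 * 60)          # 450 s
se_duration = ioi + 2 * ioi_padding  # 1200 s = 20 min, identical for every row

# Row with a reported stress event (appraisal_stressfulness_event != 0):
shift_event = ioi_padding            # 450 s -> window [event - 7.5 min, event + 12.5 min]

# Row with no stress event (appraisal_stressfulness_event == 0), where se_time was
# filled with the questionnaire start timestamp:
shift_no_event = se_duration         # 1200 s -> window [questionnaire start - 20 min, questionnaire start]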
@@ -188,7 +214,7 @@ def extract_ers(esm_df):
    else:
        raise Exception("Please select correct target method for the event-related segments.")
        extracted_ers = pd.DataFrame(columns=["label", "event_timestamp", "length", "shift", "shift_direction", "device_id"])

    return extracted_ers[["label", "event_timestamp", "length", "shift", "shift_direction", "device_id"]]
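Finally, a minimal sketch (an illustration, not code from this repository) of how one returned row maps to an absolute time window. It works in raw seconds and milliseconds because format_timestamp's output format is not shown in this diff, and it assumes shift_direction = -1 moves the segment start backwards.

def event_window(event_timestamp_ms: int, shift_s: int, length_s: int, shift_direction: int = -1):
    """Return the absolute segment window (start_ms, end_ms) for one ERS row."""
    start_ms = event_timestamp_ms + shift_direction * shift_s * 1000
    end_ms = start_ms + length_s * 1000
    return start_ms, end_ms

# A stress-event row with this commit's defaults: shift = 450 s, length = 1200 s
print(event_window(1671026400000, 450, 1200))  # -> (1671025950000, 1671027150000)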