Fix a bug related to wrong user input (duplicated events).

2022-11-15 09:53:31 +00:00 · 2022-11-15 09:53:31 +00:00 · 621f11b2d9
parent bd41f42a5d
commit 621f11b2d9
2 changed files with 17 additions and 15 deletions
--- a/NaN.png
+++ b/NaN.png
--- a/src/features/phone_esm/straw/process_user_event_related_segments.py
+++ b/src/features/phone_esm/straw/process_user_event_related_segments.py
@ -67,10 +67,10 @@ def extract_ers(esm_df):
    segmenting_method = config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["SEGMENTING_METHOD"]
    
    if segmenting_method in ["30_before", "90_before"]: # takes 30-minute peroid before the questionnaire + the duration of the questionnaire
-    """ '30-minutes and 90-minutes before' have the same fundamental logic with couple of deviations that will be explained below.
-    Both take x-minute period before the questionnaire that is summed with the questionnaire duration.
-    All questionnaire durations over 15 minutes are excluded from the querying.
-    """
+        """ '30-minutes and 90-minutes before' have the same fundamental logic with a couple of deviations that will be explained below.
+        Both take x-minute period before the questionnaire that is summed with the questionnaire duration.
+        All questionnaire durations over 15 minutes are excluded from the querying.
+        """
        # Extract time-relevant information
        extracted_ers = esm_df.groupby(["device_id", "esm_session"])['timestamp'].apply(lambda x: math.ceil((x.max() - x.min()) / 1000)).reset_index() # questionnaire length
        extracted_ers["label"] = f"straw_event_{segmenting_method}_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3) 
@ -79,9 +79,9 @@ def extract_ers(esm_df):
        extracted_ers["shift_direction"] = -1 

        if segmenting_method == "30_before":
-        """The method 30-minutes before simply takes 30 minutes before the questionnaire and sums it with the questionnaire duration.
-        The timestamps are formatted with the help of format_timestamp() method.
-        """
+            """The method 30-minutes before simply takes 30 minutes before the questionnaire and sums it with the questionnaire duration.
+            The timestamps are formatted with the help of format_timestamp() method.
+            """
            time_before_questionnaire = 30 * 60 # in seconds (30 minutes)

            extracted_ers["length"] = (extracted_ers["timestamp"] + time_before_questionnaire).apply(lambda x: format_timestamp(x))
@ -89,9 +89,9 @@ def extract_ers(esm_df):
            extracted_ers["shift"] = extracted_ers["shift"].apply(lambda x: format_timestamp(x))
        
        elif segmenting_method == "90_before":
-        """The method 90-minutes before has an important condition. If the time between the current and the previous questionnaire is
-        longer then 90 minutes it takes 90 minutes, otherwise it takes the original time difference between the questionnaires.
-        """
+            """The method 90-minutes before has an important condition. If the time between the current and the previous questionnaire is
+            longer than 90 minutes it takes 90 minutes, otherwise it takes the original time difference between the questionnaires.
+            """
            time_before_questionnaire = 90 * 60 # in seconds (90 minutes)

            extracted_ers[['end_event_timestamp', 'device_id']] = esm_df.groupby(["device_id", "esm_session"])['timestamp'].max().reset_index()[['timestamp', 'device_id']]
@ -135,7 +135,7 @@ def extract_ers(esm_df):
        extracted_ers = extracted_ers[(~extracted_ers.se_time.str.startswith("0 - ")) & (~extracted_ers.se_duration.str.startswith("0 - "))]

        # Transform data into its final form, ready for the extraction
-        extracted_ers.reset_index(inplace=True)
+        extracted_ers.reset_index(drop=True, inplace=True)

        time_before_event = 5 * 60 # in seconds (5 minutes)
        extracted_ers['event_timestamp'] = pd.to_datetime(extracted_ers['se_time']).apply(lambda x: x.timestamp() * 1000).astype('int64')
@ -154,13 +154,15 @@ def extract_ers(esm_df):
        extracted_ers['se_duration'] = \
            extracted_ers['se_duration'].apply(lambda x: math.ceil(x / 1000) if isinstance(x, int) else (pd.to_datetime(x).hour * 60 + pd.to_datetime(x).minute) * 60) + time_before_event

-        # Exclude events that are longer than 2.5 hours
-        extracted_ers = extracted_ers[extracted_ers["se_duration"] <= 2.5 * 60 * 60].reset_index(drop=True) 
-
-        extracted_ers["label"] = f"straw_event_{segmenting_method}_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3)
        extracted_ers['shift'] = format_timestamp(time_before_event)
        extracted_ers['length'] = extracted_ers['se_duration'].apply(lambda x: format_timestamp(x))

+        # Drop event_timestamp duplicates in case the user references the same event in multiple questionnaires
+        extracted_ers.drop_duplicates(subset=["event_timestamp"], keep='first', inplace=True)
+        extracted_ers.reset_index(drop=True, inplace=True)
+
+        extracted_ers["label"] = f"straw_event_{segmenting_method}_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3)
+
        # Write the csv of extracted ERS labels with targets (stress event intensity)   
        extracted_ers[["label", "intensity"]].to_csv(snakemake.output[1], index=False)