Fix a bug related to wrong user input (duplicated events).

imputation_and_cleaning
Primoz 2022-11-15 09:53:31 +00:00
parent bd41f42a5d
commit 621f11b2d9
2 changed files with 17 additions and 15 deletions

NaN.png — binary image (22 KiB), not shown in the diff.


@@ -67,10 +67,10 @@ def extract_ers(esm_df):
segmenting_method = config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["SEGMENTING_METHOD"]

if segmenting_method in ["30_before", "90_before"]:  # takes a 30- or 90-minute period before the questionnaire + the duration of the questionnaire
    """ '30-minutes and 90-minutes before' have the same fundamental logic with a couple of deviations that will be explained below.
        Both take an x-minute period before the questionnaire that is summed with the questionnaire duration.
        All questionnaire durations over 15 minutes are excluded from the querying.
    """
    # Extract time-relevant information
    extracted_ers = esm_df.groupby(["device_id", "esm_session"])['timestamp'].apply(lambda x: math.ceil((x.max() - x.min()) / 1000)).reset_index()  # questionnaire length
    extracted_ers["label"] = f"straw_event_{segmenting_method}_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3)
@@ -79,9 +79,9 @@ def extract_ers(esm_df):
extracted_ers["shift_direction"] = -1

if segmenting_method == "30_before":
    """The method 30-minutes before simply takes 30 minutes before the questionnaire and sums it with the questionnaire duration.
       The timestamps are formatted with the help of the format_timestamp() method.
    """
    time_before_questionnaire = 30 * 60  # in seconds (30 minutes)
    extracted_ers["length"] = (extracted_ers["timestamp"] + time_before_questionnaire).apply(lambda x: format_timestamp(x))
@@ -89,9 +89,9 @@ def extract_ers(esm_df):
    extracted_ers["shift"] = extracted_ers["shift"].apply(lambda x: format_timestamp(x))

elif segmenting_method == "90_before":
    """The method 90-minutes before has an important condition. If the time between the current and the previous questionnaire is
       longer than 90 minutes, it takes 90 minutes; otherwise it takes the original time difference between the questionnaires.
    """
    time_before_questionnaire = 90 * 60  # in seconds (90 minutes)
    extracted_ers[['end_event_timestamp', 'device_id']] = esm_df.groupby(["device_id", "esm_session"])['timestamp'].max().reset_index()[['timestamp', 'device_id']]
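The rest of the '90_before' branch lies outside this hunk. As a sketch only, the capping rule described in the docstring (take 90 minutes, unless the previous questionnaire ended more recently) could be expressed as follows, with made-up end timestamps; this is not the repository's actual implementation.

import pandas as pd

# End timestamps (milliseconds) of three consecutive questionnaires on one device, made up for illustration
end_event_timestamp = pd.Series([1_000_000, 4_000_000, 4_900_000])

cap = 90 * 60 * 1000  # 90 minutes in milliseconds
gap_to_previous = end_event_timestamp.diff()  # NaN for the first questionnaire

# Use the full 90 minutes unless the previous questionnaire ended less than 90 minutes earlier
window_before = gap_to_previous.fillna(cap).clip(upper=cap).astype("int64")
print((window_before // 60_000).tolist())  # [90, 50, 15] minutes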
@@ -135,7 +135,7 @@ def extract_ers(esm_df):
extracted_ers = extracted_ers[(~extracted_ers.se_time.str.startswith("0 - ")) & (~extracted_ers.se_duration.str.startswith("0 - "))]

# Transform data into its final form, ready for the extraction
- extracted_ers.reset_index(inplace=True)
+ extracted_ers.reset_index(drop=True, inplace=True)

time_before_event = 5 * 60  # in seconds (5 minutes)
extracted_ers['event_timestamp'] = pd.to_datetime(extracted_ers['se_time']).apply(lambda x: x.timestamp() * 1000).astype('int64')
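The only change in this hunk is the added drop=True. A minimal illustration of the difference: without it, reset_index() keeps the old (non-contiguous) index around as an extra 'index' column.

import pandas as pd

# Non-contiguous index, as left behind by the row filtering above
df = pd.DataFrame({"se_duration": [10, 20, 30]}, index=[0, 2, 5])

kept = df.reset_index()              # old behaviour: old index preserved as a new column
dropped = df.reset_index(drop=True)  # new behaviour: old index discarded

print(list(kept.columns))     # ['index', 'se_duration']
print(list(dropped.columns))  # ['se_duration']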
@@ -154,13 +154,15 @@ def extract_ers(esm_df):
extracted_ers['se_duration'] = \
    extracted_ers['se_duration'].apply(lambda x: math.ceil(x / 1000) if isinstance(x, int) else (pd.to_datetime(x).hour * 60 + pd.to_datetime(x).minute) * 60) + time_before_event

# Exclude events that are longer than 2.5 hours
extracted_ers = extracted_ers[extracted_ers["se_duration"] <= 2.5 * 60 * 60].reset_index(drop=True)

extracted_ers["label"] = f"straw_event_{segmenting_method}_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3)

extracted_ers['shift'] = format_timestamp(time_before_event)
extracted_ers['length'] = extracted_ers['se_duration'].apply(lambda x: format_timestamp(x))

# Drop event_timestamp duplicates in case the user referenced the same event over multiple questionnaires
extracted_ers.drop_duplicates(subset=["event_timestamp"], keep='first', inplace=True)
extracted_ers.reset_index(drop=True, inplace=True)
extracted_ers["label"] = f"straw_event_{segmenting_method}_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3)

# Write the csv of extracted ERS labels with targets (stress event intensity)
extracted_ers[["label", "intensity"]].to_csv(snakemake.output[1], index=False)