Fix a bug related to wrong user input (duplicated events).
parent bd41f42a5d
commit 621f11b2d9
@@ -67,10 +67,10 @@ def extract_ers(esm_df):
     segmenting_method = config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["SEGMENTING_METHOD"]

     if segmenting_method in ["30_before", "90_before"]: # takes 30-minute peroid before the questionnaire + the duration of the questionnaire
         """ '30-minutes and 90-minutes before' have the same fundamental logic with couple of deviations that will be explained below.
         Both take x-minute period before the questionnaire that is summed with the questionnaire duration.
         All questionnaire durations over 15 minutes are excluded from the querying.
         """
         # Extract time-relevant information
         extracted_ers = esm_df.groupby(["device_id", "esm_session"])['timestamp'].apply(lambda x: math.ceil((x.max() - x.min()) / 1000)).reset_index() # questionnaire length
         extracted_ers["label"] = f"straw_event_{segmenting_method}_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3)
@@ -79,9 +79,9 @@ def extract_ers(esm_df):
         extracted_ers["shift_direction"] = -1

         if segmenting_method == "30_before":
             """The method 30-minutes before simply takes 30 minutes before the questionnaire and sums it with the questionnaire duration.
             The timestamps are formatted with the help of format_timestamp() method.
             """
             time_before_questionnaire = 30 * 60 # in seconds (30 minutes)

             extracted_ers["length"] = (extracted_ers["timestamp"] + time_before_questionnaire).apply(lambda x: format_timestamp(x))
@@ -89,9 +89,9 @@ def extract_ers(esm_df):
             extracted_ers["shift"] = extracted_ers["shift"].apply(lambda x: format_timestamp(x))

         elif segmenting_method == "90_before":
             """The method 90-minutes before has an important condition. If the time between the current and the previous questionnaire is
             longer then 90 minutes it takes 90 minutes, otherwise it takes the original time difference between the questionnaires.
             """
             time_before_questionnaire = 90 * 60 # in seconds (90 minutes)

             extracted_ers[['end_event_timestamp', 'device_id']] = esm_df.groupby(["device_id", "esm_session"])['timestamp'].max().reset_index()[['timestamp', 'device_id']]
@@ -135,7 +135,7 @@ def extract_ers(esm_df):
         extracted_ers = extracted_ers[(~extracted_ers.se_time.str.startswith("0 - ")) & (~extracted_ers.se_duration.str.startswith("0 - "))]

         # Transform data into its final form, ready for the extraction
-        extracted_ers.reset_index(inplace=True)
+        extracted_ers.reset_index(drop=True, inplace=True)

         time_before_event = 5 * 60 # in seconds (5 minutes)
         extracted_ers['event_timestamp'] = pd.to_datetime(extracted_ers['se_time']).apply(lambda x: x.timestamp() * 1000).astype('int64')
@@ -154,13 +154,15 @@ def extract_ers(esm_df):
         extracted_ers['se_duration'] = \
             extracted_ers['se_duration'].apply(lambda x: math.ceil(x / 1000) if isinstance(x, int) else (pd.to_datetime(x).hour * 60 + pd.to_datetime(x).minute) * 60) + time_before_event

-        # Exclude events that are longer than 2.5 hours
-        extracted_ers = extracted_ers[extracted_ers["se_duration"] <= 2.5 * 60 * 60].reset_index(drop=True)
-
-        extracted_ers["label"] = f"straw_event_{segmenting_method}_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3)
         extracted_ers['shift'] = format_timestamp(time_before_event)
         extracted_ers['length'] = extracted_ers['se_duration'].apply(lambda x: format_timestamp(x))

+        # Drop event_timestamp duplicates in case of user referencing the same event over multiple questionnaires
+        extracted_ers.drop_duplicates(subset=["event_timestamp"], keep='first', inplace=True)
+        extracted_ers.reset_index(drop=True, inplace=True)
+
+        extracted_ers["label"] = f"straw_event_{segmenting_method}_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3)
+
         # Write the csv of extracted ERS labels with targets (stress event intensity)
         extracted_ers[["label", "intensity"]].to_csv(snakemake.output[1], index=False)
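Note on the fix itself: the labels are derived from the DataFrame index, so rows pointing at the same event_timestamp have to be dropped and the index reset before the numbering happens; otherwise one event ends up in several segments and the label sequence has gaps. A minimal standalone sketch of that ordering, with made-up data and placeholder values for the snakemake parameters (this is not the pipeline's own code):

import pandas as pd

# Toy event-related segments: two questionnaires reference the same event_timestamp.
extracted_ers = pd.DataFrame({
    "event_timestamp": [1671000000000, 1671000000000, 1671003600000],
    "intensity": [3, 3, 5],
})

# Drop event_timestamp duplicates (same event reported in multiple questionnaires),
# then renumber the rows from 0 so the index is gap-free.
extracted_ers.drop_duplicates(subset=["event_timestamp"], keep="first", inplace=True)
extracted_ers.reset_index(drop=True, inplace=True)

# Only now build the labels from the index, as the new code does;
# the method name and pid below are placeholders, not real pipeline values.
segmenting_method, pid = "example_method", "p001"
extracted_ers["label"] = (
    f"straw_event_{segmenting_method}_" + pid + "_"
    + extracted_ers.index.astype(str).str.zfill(3)
)
print(extracted_ers["label"].tolist())
# ['straw_event_example_method_p001_000', 'straw_event_example_method_p001_001']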
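The other change, reset_index(inplace=True) -> reset_index(drop=True, inplace=True), matters because without drop=True pandas keeps the old index as an extra 'index' column in the frame. A quick illustration with toy data:

import pandas as pd

df = pd.DataFrame({"se_duration": [60, 120, 180]}, index=[3, 7, 9])

kept = df.reset_index()              # old call: the former index survives as an 'index' column
dropped = df.reset_index(drop=True)  # new call: rows are simply renumbered 0..2

print(kept.columns.tolist())     # ['index', 'se_duration']
print(dropped.columns.tolist())  # ['se_duration']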
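For reference, the questionnaire length used at the top of the first hunk is simply the span between the first and last ESM answer of a session, rounded up to whole seconds. A self-contained sketch with invented timestamps (not the study's data):

import math
import pandas as pd

# Toy ESM answers: timestamps are in milliseconds, as in the pipeline.
esm_df = pd.DataFrame({
    "device_id":   ["dev1", "dev1", "dev1", "dev1"],
    "esm_session": [0, 0, 1, 1],
    "timestamp":   [1671000000000, 1671000065500, 1671007200000, 1671007321000],
})

# Questionnaire length per (device, session): ceil((last - first) / 1000) seconds.
extracted_ers = (
    esm_df.groupby(["device_id", "esm_session"])["timestamp"]
    .apply(lambda x: math.ceil((x.max() - x.min()) / 1000))
    .reset_index()
)
print(extracted_ers)  # session 0 -> 66 s, session 1 -> 121 s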