Fix a bug related to wrong user input (duplicated events).

imputation_and_cleaning
Primoz 2022-11-15 09:53:31 +00:00
parent bd41f42a5d
commit 621f11b2d9
2 changed files with 17 additions and 15 deletions

BIN
NaN.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 22 KiB

View File

@ -135,7 +135,7 @@ def extract_ers(esm_df):
extracted_ers = extracted_ers[(~extracted_ers.se_time.str.startswith("0 - ")) & (~extracted_ers.se_duration.str.startswith("0 - "))] extracted_ers = extracted_ers[(~extracted_ers.se_time.str.startswith("0 - ")) & (~extracted_ers.se_duration.str.startswith("0 - "))]
# Transform data into its final form, ready for the extraction # Transform data into its final form, ready for the extraction
extracted_ers.reset_index(inplace=True) extracted_ers.reset_index(drop=True, inplace=True)
time_before_event = 5 * 60 # in seconds (5 minutes) time_before_event = 5 * 60 # in seconds (5 minutes)
extracted_ers['event_timestamp'] = pd.to_datetime(extracted_ers['se_time']).apply(lambda x: x.timestamp() * 1000).astype('int64') extracted_ers['event_timestamp'] = pd.to_datetime(extracted_ers['se_time']).apply(lambda x: x.timestamp() * 1000).astype('int64')
@ -154,13 +154,15 @@ def extract_ers(esm_df):
extracted_ers['se_duration'] = \ extracted_ers['se_duration'] = \
extracted_ers['se_duration'].apply(lambda x: math.ceil(x / 1000) if isinstance(x, int) else (pd.to_datetime(x).hour * 60 + pd.to_datetime(x).minute) * 60) + time_before_event extracted_ers['se_duration'].apply(lambda x: math.ceil(x / 1000) if isinstance(x, int) else (pd.to_datetime(x).hour * 60 + pd.to_datetime(x).minute) * 60) + time_before_event
# Exclude events that are longer than 2.5 hours
extracted_ers = extracted_ers[extracted_ers["se_duration"] <= 2.5 * 60 * 60].reset_index(drop=True)
extracted_ers["label"] = f"straw_event_{segmenting_method}_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3)
extracted_ers['shift'] = format_timestamp(time_before_event) extracted_ers['shift'] = format_timestamp(time_before_event)
extracted_ers['length'] = extracted_ers['se_duration'].apply(lambda x: format_timestamp(x)) extracted_ers['length'] = extracted_ers['se_duration'].apply(lambda x: format_timestamp(x))
# Drop event_timestamp duplicates in case of user referencing the same event over multiple questionnaires
extracted_ers.drop_duplicates(subset=["event_timestamp"], keep='first', inplace=True)
extracted_ers.reset_index(drop=True, inplace=True)
extracted_ers["label"] = f"straw_event_{segmenting_method}_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3)
# Write the csv of extracted ERS labels with targets (stress event intensity) # Write the csv of extracted ERS labels with targets (stress event intensity)
extracted_ers[["label", "intensity"]].to_csv(snakemake.output[1], index=False) extracted_ers[["label", "intensity"]].to_csv(snakemake.output[1], index=False)