Extract ERS and stress event targets to csv files (completed).
parent
9199b53ded
commit
a668b6e8da
|
@ -264,12 +264,14 @@ rule extract_event_information_from_esm:
|
||||||
script:
|
script:
|
||||||
"../src/features/phone_esm/straw/process_user_event_related_segments.py"
|
"../src/features/phone_esm/straw/process_user_event_related_segments.py"
|
||||||
|
|
||||||
rule create_event_related_segments_file:
|
rule merge_event_related_segments_files:
|
||||||
input:
|
input:
|
||||||
ers_files = expand("data/raw/ers/{pid}_ers.csv", pid=config["PIDS"])
|
ers_files = expand("data/raw/ers/{pid}_ers.csv", pid=config["PIDS"]),
|
||||||
|
se_files = expand("data/raw/ers/{pid}_stress_event_targets.csv", pid=config["PIDS"])
|
||||||
params:
|
params:
|
||||||
stage = "merge"
|
stage = "merge"
|
||||||
output:
|
output:
|
||||||
"data/external/straw_events.csv"
|
"data/external/straw_events.csv",
|
||||||
|
"data/external/stress_event_targets.csv"
|
||||||
script:
|
script:
|
||||||
"../src/features/phone_esm/straw/process_user_event_related_segments.py"
|
"../src/features/phone_esm/straw/process_user_event_related_segments.py"
|
|
@ -23,7 +23,7 @@ def format_timestamp(x):
|
||||||
return tstring
|
return tstring
|
||||||
|
|
||||||
|
|
||||||
def extract_ers_from_file(esm_df, device_id):
|
def extract_ers(esm_df, device_id):
|
||||||
|
|
||||||
pd.set_option("display.max_rows", 20)
|
pd.set_option("display.max_rows", 20)
|
||||||
pd.set_option("display.max_columns", None)
|
pd.set_option("display.max_columns", None)
|
||||||
|
@ -31,9 +31,7 @@ def extract_ers_from_file(esm_df, device_id):
|
||||||
with open('config.yaml', 'r') as stream:
|
with open('config.yaml', 'r') as stream:
|
||||||
config = yaml.load(stream, Loader=yaml.FullLoader)
|
config = yaml.load(stream, Loader=yaml.FullLoader)
|
||||||
|
|
||||||
|
pd.DataFrame(columns=["label", "intensity"]).to_csv(snakemake.output[1]) # Create an empty stress_events_targets file
|
||||||
pd.DataFrame().to_csv(snakemake.output[1]) # Create an empty stress event file either way TODO
|
|
||||||
|
|
||||||
|
|
||||||
esm_preprocessed = clean_up_esm(preprocess_esm(esm_df))
|
esm_preprocessed = clean_up_esm(preprocess_esm(esm_df))
|
||||||
|
|
||||||
|
@ -42,7 +40,6 @@ def extract_ers_from_file(esm_df, device_id):
|
||||||
esm_filtered_sessions = classified[classified["session_response"] == 'ema_completed'].reset_index()[['device_id', 'esm_session']]
|
esm_filtered_sessions = classified[classified["session_response"] == 'ema_completed'].reset_index()[['device_id', 'esm_session']]
|
||||||
esm_df = esm_preprocessed.loc[(esm_preprocessed['device_id'].isin(esm_filtered_sessions['device_id'])) & (esm_preprocessed['esm_session'].isin(esm_filtered_sessions['esm_session']))]
|
esm_df = esm_preprocessed.loc[(esm_preprocessed['device_id'].isin(esm_filtered_sessions['device_id'])) & (esm_preprocessed['esm_session'].isin(esm_filtered_sessions['esm_session']))]
|
||||||
|
|
||||||
|
|
||||||
targets_method = config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["TARGETS_METHOD"]
|
targets_method = config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["TARGETS_METHOD"]
|
||||||
if targets_method in ["30_before", "90_before"]: # takes 30-minute period before the questionnaire + the duration of the questionnaire
|
if targets_method in ["30_before", "90_before"]: # takes 30-minute period before the questionnaire + the duration of the questionnaire
|
||||||
# Extract time-relevant information
|
# Extract time-relevant information
|
||||||
|
@ -73,14 +70,12 @@ def extract_ers_from_file(esm_df, device_id):
|
||||||
extracted_ers["shift"] = extracted_ers["diffs"].apply(lambda x: format_timestamp(x))
|
extracted_ers["shift"] = extracted_ers["diffs"].apply(lambda x: format_timestamp(x))
|
||||||
|
|
||||||
elif targets_method == "stress_event":
|
elif targets_method == "stress_event":
|
||||||
# TODO: generate the ERS file for stress_events
|
|
||||||
|
|
||||||
# Get and join required data
|
# Get and join required data
|
||||||
extracted_ers = esm_df.groupby(["device_id", "esm_session"])['timestamp'].apply(lambda x: math.ceil((x.max() - x.min()) / 1000)).reset_index().rename(columns={'timestamp': 'session_length'}) # questionnaire session length (timestamp range / 1000, rounded up)
|
extracted_ers = esm_df.groupby(["device_id", "esm_session"])['timestamp'].apply(lambda x: math.ceil((x.max() - x.min()) / 1000)).reset_index().rename(columns={'timestamp': 'session_length'}) # questionnaire session length (timestamp range / 1000, rounded up)
|
||||||
session_end_timestamp = esm_df.groupby(['device_id', 'esm_session'])['timestamp'].max().to_frame().rename(columns={'timestamp': 'session_end_timestamp'}) # questionnaire end timestamp
|
session_end_timestamp = esm_df.groupby(['device_id', 'esm_session'])['timestamp'].max().to_frame().rename(columns={'timestamp': 'session_end_timestamp'}) # questionnaire end timestamp
|
||||||
se_time = esm_df[esm_df.questionnaire_id == 90.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_time'})
|
se_time = esm_df[esm_df.questionnaire_id == 90.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_time'})
|
||||||
se_duration = esm_df[esm_df.questionnaire_id == 91.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_duration'})
|
se_duration = esm_df[esm_df.questionnaire_id == 91.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_duration'})
|
||||||
se_intensity = esm_df[esm_df.questionnaire_id == 87.].set_index(['device_id', 'esm_session'])['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'se_intensity'})
|
se_intensity = esm_df[esm_df.questionnaire_id == 87.].set_index(['device_id', 'esm_session'])['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'intensity'})
|
||||||
|
|
||||||
extracted_ers = extracted_ers.join(session_end_timestamp, on=['device_id', 'esm_session'], how='inner') \
|
extracted_ers = extracted_ers.join(session_end_timestamp, on=['device_id', 'esm_session'], how='inner') \
|
||||||
.join(se_time, on=['device_id', 'esm_session'], how='inner') \
|
.join(se_time, on=['device_id', 'esm_session'], how='inner') \
|
||||||
|
@ -88,40 +83,30 @@ def extract_ers_from_file(esm_df, device_id):
|
||||||
.join(se_intensity, on=['device_id', 'esm_session'], how='inner')
|
.join(se_intensity, on=['device_id', 'esm_session'], how='inner')
|
||||||
|
|
||||||
# Filter sessions that are not useful
|
# Filter sessions that are not useful
|
||||||
extracted_ers = extracted_ers[(extracted_ers.se_time != "0 - Ne spomnim se")]
|
extracted_ers = extracted_ers[(extracted_ers.se_time != "0 - Ne spomnim se") & (extracted_ers.se_duration != "0 - Ne spomnim se")]
|
||||||
|
|
||||||
# Transform data into its final form, ready for the extraction
|
# Transform data into its final form, ready for the extraction
|
||||||
extracted_ers.reset_index(inplace=True)
|
extracted_ers.reset_index(inplace=True)
|
||||||
extracted_ers["label"] = f"straw_event_{targets_method}_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3)
|
extracted_ers["label"] = f"straw_event_{targets_method}_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3)
|
||||||
|
|
||||||
# Convert to unix timestamp
|
time_before_event = 10 * 60 # in seconds (10 minutes)
|
||||||
|
|
||||||
time_before_event = 90 * 60 # in seconds (90 minutes)
|
|
||||||
extracted_ers['event_timestamp'] = pd.to_datetime(extracted_ers['se_time']).apply(lambda x: x.timestamp() * 1000).astype('int64')
|
extracted_ers['event_timestamp'] = pd.to_datetime(extracted_ers['se_time']).apply(lambda x: x.timestamp() * 1000).astype('int64')
|
||||||
extracted_ers['shift'] = time_before_event
|
|
||||||
extracted_ers['shift_direction'] = -1
|
extracted_ers['shift_direction'] = -1
|
||||||
|
|
||||||
print(extracted_ers[['session_end_timestamp', 'event_timestamp']])
|
|
||||||
|
|
||||||
extracted_ers['se_duration'] = \
|
extracted_ers['se_duration'] = \
|
||||||
np.where(extracted_ers['se_duration'] == "1 - Še vedno traja",
|
np.where(
|
||||||
|
extracted_ers['se_duration'] == "1 - Še vedno traja",
|
||||||
extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'],
|
extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'],
|
||||||
extracted_ers['se_duration'])
|
extracted_ers['se_duration']
|
||||||
|
)
|
||||||
|
|
||||||
extracted_ers['se_duration'] = \
|
extracted_ers['se_duration'] = \
|
||||||
extracted_ers['se_duration'].apply(lambda x: math.ceil(x / 1000) if isinstance(x, int) else (pd.to_datetime(x).hour * 60 + pd.to_datetime(x).minute) * 60)
|
extracted_ers['se_duration'].apply(lambda x: math.ceil(x / 1000) if isinstance(x, int) else (pd.to_datetime(x).hour * 60 + pd.to_datetime(x).minute) * 60) + time_before_event
|
||||||
|
|
||||||
sys.exit()
|
extracted_ers['shift'] = format_timestamp(time_before_event)
|
||||||
# VV Testing various queries for VV
|
extracted_ers['length'] = extracted_ers['se_duration'].apply(lambda x: format_timestamp(x))
|
||||||
filter_esm = esm_df[(esm_df.esm_type == 7) & ((esm_df.questionnaire_id == 90.) | (esm_df.questionnaire_id == 91.))][['questionnaire_id', 'esm_user_answer', 'esm_session']]
|
|
||||||
print(filter_esm[filter_esm.esm_user_answer == "1 - Še vedno traja"].shape)
|
|
||||||
print(filter_esm.shape)
|
|
||||||
|
|
||||||
# TODO: generate the stress_events_targets file (also add a pid column) + you need to add a merge method that will combine these files
|
extracted_ers[["label", "intensity"]].to_csv(snakemake.output[1], index=False)
|
||||||
# TODO: in the end, the cleaning script must properly remove all targets and attach the new targets next to the matching segments (overlaps may occur)
|
|
||||||
|
|
||||||
pd.DataFrame().to_csv(snakemake.output[1])
|
|
||||||
sys.exit()
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise Exception("Please select correct target method for the event-related segments.")
|
raise Exception("Please select correct target method for the event-related segments.")
|
||||||
|
@ -129,19 +114,23 @@ def extract_ers_from_file(esm_df, device_id):
|
||||||
|
|
||||||
return extracted_ers[["label", "event_timestamp", "length", "shift", "shift_direction", "device_id"]]
|
return extracted_ers[["label", "event_timestamp", "length", "shift", "shift_direction", "device_id"]]
|
||||||
|
|
||||||
|
|
||||||
|
# Actual code execution
|
||||||
if snakemake.params["stage"] == "extract":
|
if snakemake.params["stage"] == "extract":
|
||||||
esm_df = pd.read_csv(input_data_files['esm_raw_input'])
|
esm_df = pd.read_csv(input_data_files['esm_raw_input'])
|
||||||
|
|
||||||
with open(input_data_files['pid_file'], 'r') as stream:
|
with open(input_data_files['pid_file'], 'r') as stream:
|
||||||
pid_file = yaml.load(stream, Loader=yaml.FullLoader)
|
pid_file = yaml.load(stream, Loader=yaml.FullLoader)
|
||||||
|
|
||||||
extracted_ers = extract_ers_from_file(esm_df, pid_file["PHONE"]["DEVICE_IDS"][0])
|
extracted_ers = extract_ers(esm_df, pid_file["PHONE"]["DEVICE_IDS"][0])
|
||||||
|
|
||||||
extracted_ers.to_csv(snakemake.output[0], index=False)
|
extracted_ers.to_csv(snakemake.output[0], index=False)
|
||||||
|
|
||||||
elif snakemake.params["stage"] == "merge":
|
elif snakemake.params["stage"] == "merge":
|
||||||
|
|
||||||
input_data_files = dict(snakemake.input)
|
input_data_files = dict(snakemake.input)
|
||||||
straw_events = pd.DataFrame(columns=["label", "event_timestamp", "length", "shift", "shift_direction", "device_id"])
|
straw_events = pd.DataFrame(columns=["label", "event_timestamp", "length", "shift", "shift_direction", "device_id"])
|
||||||
|
stress_events_targets = pd.DataFrame(columns=["label", "intensity"])
|
||||||
|
|
||||||
for input_file in input_data_files["ers_files"]:
|
for input_file in input_data_files["ers_files"]:
|
||||||
ers_df = pd.read_csv(input_file)
|
ers_df = pd.read_csv(input_file)
|
||||||
|
@ -149,3 +138,11 @@ elif snakemake.params["stage"] == "merge":
|
||||||
|
|
||||||
straw_events.to_csv(snakemake.output[0], index=False)
|
straw_events.to_csv(snakemake.output[0], index=False)
|
||||||
|
|
||||||
|
for input_file in input_data_files["se_files"]:
|
||||||
|
se_df = pd.read_csv(input_file)
|
||||||
|
stress_events_targets = pd.concat([stress_events_targets, se_df], axis=0, ignore_index=True)
|
||||||
|
|
||||||
|
stress_events_targets.to_csv(snakemake.output[1], index=False)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue