Fix some bugs and extend ERS and cleaning scripts with multiple stress event targets logic.

imputation_and_cleaning
Primoz 2022-11-15 11:21:51 +00:00
parent ab803ee49c
commit 286de93bfd
3 changed files with 28 additions and 16 deletions

View File

@ -22,25 +22,29 @@ def straw_cleaning(sensor_data_files, provider, target):
excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime'] excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']
graph_bf_af(features, "1target_rows_before") graph_bf_af(features, "1target_rows_before")
# (1.0) OVERRIDE STRESSFULNESS EVENT TARGETS IF ERS TARGETS_METHOD IS "STRESS_EVENT"
if config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["TARGETS_METHOD"] == "stress_event": # (1.0) OVERRIDE STRESSFULNESS EVENT TARGETS IF ERS SEGMENTING_METHOD IS "STRESS_EVENT"
if config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["SEGMENTING_METHOD"] == "stress_event":
stress_events_targets = pd.read_csv("data/external/stress_event_targets.csv") stress_events_targets = pd.read_csv("data/external/stress_event_targets.csv")
if "appraisal_stressfulness_event_mean" in config['PARAMS_FOR_ANALYSIS']['TARGET']['ALL_LABELS']: if "appraisal_stressfulness_event_mean" in config['PARAMS_FOR_ANALYSIS']['TARGET']['ALL_LABELS']:
features.drop(columns=['phone_esm_straw_appraisal_stressfulness_event_mean'], inplace=True) features.drop(columns=['phone_esm_straw_appraisal_stressfulness_event_mean'], inplace=True)
features = features.merge(stress_events_targets.rename(columns={'label': 'local_segment_label'}), on=['local_segment_label'], how='inner') \ features = features.merge(stress_events_targets[["label", "appraisal_stressfulness_event"]] \
.rename(columns={'appraisal_stressfulness_event': 'phone_esm_straw_appraisal_stressfulness_event_mean'}) .rename(columns={'label': 'local_segment_label'}), on=['local_segment_label'], how='inner') \
.rename(columns={'appraisal_stressfulness_event': 'phone_esm_straw_appraisal_stressfulness_event_mean'})
if "appraisal_threat_mean" in config['PARAMS_FOR_ANALYSIS']['TARGET']['ALL_LABELS']: if "appraisal_threat_mean" in config['PARAMS_FOR_ANALYSIS']['TARGET']['ALL_LABELS']:
features.drop(columns=['phone_esm_straw_appraisal_threat_mean'], inplace=True) features.drop(columns=['phone_esm_straw_appraisal_threat_mean'], inplace=True)
features = features.merge(stress_events_targets.rename(columns={'label': 'local_segment_label'}), on=['local_segment_label'], how='inner') \ features = features.merge(stress_events_targets[["label", "appraisal_threat"]] \
.rename(columns={'appraisal_threat_mean': 'phone_esm_straw_appraisal_threat_mean'}) .rename(columns={'label': 'local_segment_label'}), on=['local_segment_label'], how='inner') \
.rename(columns={'appraisal_threat': 'phone_esm_straw_appraisal_threat_mean'})
if "appraisal_challenge_mean" in config['PARAMS_FOR_ANALYSIS']['TARGET']['ALL_LABELS']: if "appraisal_challenge_mean" in config['PARAMS_FOR_ANALYSIS']['TARGET']['ALL_LABELS']:
features.drop(columns=['phone_esm_straw_appraisal_challenge_mean'], inplace=True) features.drop(columns=['phone_esm_straw_appraisal_challenge_mean'], inplace=True)
features = features.merge(stress_events_targets.rename(columns={'label': 'local_segment_label'}), on=['local_segment_label'], how='inner') \ features = features.merge(stress_events_targets[["label", "appraisal_challenge"]] \
.rename(columns={'appraisal_challenge': 'phone_esm_straw_appraisal_challenge_mean'}) .rename(columns={'label': 'local_segment_label'}), on=['local_segment_label'], how='inner') \
.rename(columns={'appraisal_challenge': 'phone_esm_straw_appraisal_challenge_mean'})
esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')] # Get target (esm) columns esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')] # Get target (esm) columns
@ -234,7 +238,7 @@ def impute(df, method='zero'):
'knn': k_nearest(df) 'knn': k_nearest(df)
}[method] }[method]
def graph_bf_af(features, phase_name, plt_flag=True): def graph_bf_af(features, phase_name, plt_flag=False):
if plt_flag: if plt_flag:
sns.set(rc={"figure.figsize":(16, 8)}) sns.set(rc={"figure.figsize":(16, 8)})
sns.heatmap(features.isna(), cbar=False) #features.select_dtypes(include=np.number) sns.heatmap(features.isna(), cbar=False) #features.select_dtypes(include=np.number)

View File

@ -42,7 +42,7 @@ def straw_features(sensor_data_files, time_segment, provider, filter_data_by_seg
requested_features = provider["FEATURES"] requested_features = provider["FEATURES"]
# name of the features this function can compute # name of the features this function can compute
requested_scales = provider["SCALES"] requested_scales = provider["SCALES"]
base_features_names = ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support", \ base_features_names = ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support",
"appraisal_stressfulness_period", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"] "appraisal_stressfulness_period", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"]
#TODO Check valid questionnaire and feature names. #TODO Check valid questionnaire and feature names.
# the subset of requested features this function can compute # the subset of requested features this function can compute

View File

@ -122,13 +122,21 @@ def extract_ers(esm_df):
session_end_timestamp = esm_df.groupby(['device_id', 'esm_session'])['timestamp'].max().to_frame().rename(columns={'timestamp': 'session_end_timestamp'}) # questionnaire end timestamp session_end_timestamp = esm_df.groupby(['device_id', 'esm_session'])['timestamp'].max().to_frame().rename(columns={'timestamp': 'session_end_timestamp'}) # questionnaire end timestamp
se_time = esm_df[esm_df.questionnaire_id == 90.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_time'}) se_time = esm_df[esm_df.questionnaire_id == 90.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_time'})
se_duration = esm_df[esm_df.questionnaire_id == 91.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_duration'}) se_duration = esm_df[esm_df.questionnaire_id == 91.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_duration'})
se_intensity = esm_df[esm_df.questionnaire_id == 87.].set_index(['device_id', 'esm_session'])['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'intensity'})
# Extracted 3 targets that will be transfered with the csv file to the cleaning script.
se_stressfulness_event_tg = esm_df[esm_df.questionnaire_id == 87.].set_index(['device_id', 'esm_session'])['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_stressfulness_event'})
se_threat_tg = esm_df[esm_df.questionnaire_id == 88.].groupby(["device_id", "esm_session"]).mean()['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_threat'})
se_challenge_tg = esm_df[esm_df.questionnaire_id == 89.].groupby(["device_id", "esm_session"]).mean()['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_challenge'})
# All relevant features are joined by inner join to remove standalone columns (e.g., stressfulness event target has larger count)
extracted_ers = extracted_ers.join(session_end_timestamp, on=['device_id', 'esm_session'], how='inner') \ extracted_ers = extracted_ers.join(session_end_timestamp, on=['device_id', 'esm_session'], how='inner') \
.join(se_time, on=['device_id', 'esm_session'], how='inner') \ .join(se_time, on=['device_id', 'esm_session'], how='inner') \
.join(se_duration, on=['device_id', 'esm_session'], how='inner') \ .join(se_duration, on=['device_id', 'esm_session'], how='inner') \
.join(se_intensity, on=['device_id', 'esm_session'], how='inner') .join(se_stressfulness_event_tg, on=['device_id', 'esm_session'], how='inner') \
.join(se_threat_tg, on=['device_id', 'esm_session'], how='inner') \
.join(se_challenge_tg, on=['device_id', 'esm_session'], how='inner')
# Filter sessions that are not useful. Because of the ambiguity this excludes: # Filter sessions that are not useful. Because of the ambiguity this excludes:
# (1) straw event times that are marked as "0 - I don't remember" # (1) straw event times that are marked as "0 - I don't remember"
# (2) straw event durations that are marked as "0 - I don't remember" # (2) straw event durations that are marked as "0 - I don't remember"
@ -163,8 +171,8 @@ def extract_ers(esm_df):
extracted_ers["label"] = f"straw_event_{segmenting_method}_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3) extracted_ers["label"] = f"straw_event_{segmenting_method}_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3)
# Write the csv of extracted ERS labels with targets (stress event intensity) # Write the csv of extracted ERS labels with targets related to stressfulness event
extracted_ers[["label", "intensity"]].to_csv(snakemake.output[1], index=False) extracted_ers[["label", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"]].to_csv(snakemake.output[1], index=False)
else: else:
raise Exception("Please select correct target method for the event-related segments.") raise Exception("Please select correct target method for the event-related segments.")
@ -194,7 +202,7 @@ elif snakemake.params["stage"] == "merge":
input_data_files = dict(snakemake.input) input_data_files = dict(snakemake.input)
straw_events = pd.DataFrame(columns=["label", "event_timestamp", "length", "shift", "shift_direction", "device_id"]) straw_events = pd.DataFrame(columns=["label", "event_timestamp", "length", "shift", "shift_direction", "device_id"])
stress_events_targets = pd.DataFrame(columns=["label", "intensity"]) stress_events_targets = pd.DataFrame(columns=["label", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"])
for input_file in input_data_files["ers_files"]: for input_file in input_data_files["ers_files"]:
ers_df = pd.read_csv(input_file) ers_df = pd.read_csv(input_file)