diff --git a/src/features/all_cleaning_overall/straw/main.py b/src/features/all_cleaning_overall/straw/main.py index 7662440f..fb7b9344 100644 --- a/src/features/all_cleaning_overall/straw/main.py +++ b/src/features/all_cleaning_overall/straw/main.py @@ -22,25 +22,29 @@ def straw_cleaning(sensor_data_files, provider, target): excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime'] graph_bf_af(features, "1target_rows_before") - # (1.0) OVERRIDE STRESSFULNESS EVENT TARGETS IF ERS TARGETS_METHOD IS "STRESS_EVENT" - if config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["TARGETS_METHOD"] == "stress_event": + + # (1.0) OVERRIDE STRESSFULNESS EVENT TARGETS IF ERS SEGMENTING_METHOD IS "STRESS_EVENT" + if config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["SEGMENTING_METHOD"] == "stress_event": stress_events_targets = pd.read_csv("data/external/stress_event_targets.csv") if "appraisal_stressfulness_event_mean" in config['PARAMS_FOR_ANALYSIS']['TARGET']['ALL_LABELS']: features.drop(columns=['phone_esm_straw_appraisal_stressfulness_event_mean'], inplace=True) - features = features.merge(stress_events_targets.rename(columns={'label': 'local_segment_label'}), on=['local_segment_label'], how='inner') \ - .rename(columns={'appraisal_stressfulness_event': 'phone_esm_straw_appraisal_stressfulness_event_mean'}) + features = features.merge(stress_events_targets[["label", "appraisal_stressfulness_event"]] \ + .rename(columns={'label': 'local_segment_label'}), on=['local_segment_label'], how='inner') \ + .rename(columns={'appraisal_stressfulness_event': 'phone_esm_straw_appraisal_stressfulness_event_mean'}) if "appraisal_threat_mean" in config['PARAMS_FOR_ANALYSIS']['TARGET']['ALL_LABELS']: features.drop(columns=['phone_esm_straw_appraisal_threat_mean'], inplace=True) - features = features.merge(stress_events_targets.rename(columns={'label': 'local_segment_label'}), on=['local_segment_label'], how='inner') \ - 
.rename(columns={'appraisal_threat_mean': 'phone_esm_straw_appraisal_threat_mean'}) + features = features.merge(stress_events_targets[["label", "appraisal_threat"]] \ + .rename(columns={'label': 'local_segment_label'}), on=['local_segment_label'], how='inner') \ + .rename(columns={'appraisal_threat': 'phone_esm_straw_appraisal_threat_mean'}) if "appraisal_challenge_mean" in config['PARAMS_FOR_ANALYSIS']['TARGET']['ALL_LABELS']: features.drop(columns=['phone_esm_straw_appraisal_challenge_mean'], inplace=True) - features = features.merge(stress_events_targets.rename(columns={'label': 'local_segment_label'}), on=['local_segment_label'], how='inner') \ - .rename(columns={'appraisal_challenge': 'phone_esm_straw_appraisal_challenge_mean'}) + features = features.merge(stress_events_targets[["label", "appraisal_challenge"]] \ + .rename(columns={'label': 'local_segment_label'}), on=['local_segment_label'], how='inner') \ + .rename(columns={'appraisal_challenge': 'phone_esm_straw_appraisal_challenge_mean'}) esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')] # Get target (esm) columns @@ -234,7 +238,7 @@ def impute(df, method='zero'): 'knn': k_nearest(df) }[method] -def graph_bf_af(features, phase_name, plt_flag=True): +def graph_bf_af(features, phase_name, plt_flag=False): if plt_flag: sns.set(rc={"figure.figsize":(16, 8)}) sns.heatmap(features.isna(), cbar=False) #features.select_dtypes(include=np.number) diff --git a/src/features/phone_esm/straw/main.py b/src/features/phone_esm/straw/main.py index 70f09a30..8a55b8eb 100644 --- a/src/features/phone_esm/straw/main.py +++ b/src/features/phone_esm/straw/main.py @@ -42,7 +42,7 @@ def straw_features(sensor_data_files, time_segment, provider, filter_data_by_seg requested_features = provider["FEATURES"] # name of the features this function can compute requested_scales = provider["SCALES"] - base_features_names = ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", 
"JCQ_supervisor_support", "JCQ_coworker_support", \ + base_features_names = ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support", "appraisal_stressfulness_period", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"] #TODO Check valid questionnaire and feature names. # the subset of requested features this function can compute diff --git a/src/features/phone_esm/straw/process_user_event_related_segments.py b/src/features/phone_esm/straw/process_user_event_related_segments.py index a9c47370..353e714c 100644 --- a/src/features/phone_esm/straw/process_user_event_related_segments.py +++ b/src/features/phone_esm/straw/process_user_event_related_segments.py @@ -122,13 +122,21 @@ def extract_ers(esm_df): session_end_timestamp = esm_df.groupby(['device_id', 'esm_session'])['timestamp'].max().to_frame().rename(columns={'timestamp': 'session_end_timestamp'}) # questionnaire end timestamp se_time = esm_df[esm_df.questionnaire_id == 90.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_time'}) se_duration = esm_df[esm_df.questionnaire_id == 91.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_duration'}) - se_intensity = esm_df[esm_df.questionnaire_id == 87.].set_index(['device_id', 'esm_session'])['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'intensity'}) + + # Extracted 3 targets that will be transferred with the csv file to the cleaning script.
+ se_stressfulness_event_tg = esm_df[esm_df.questionnaire_id == 87.].set_index(['device_id', 'esm_session'])['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_stressfulness_event'}) + se_threat_tg = esm_df[esm_df.questionnaire_id == 88.].groupby(["device_id", "esm_session"]).mean()['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_threat'}) + se_challenge_tg = esm_df[esm_df.questionnaire_id == 89.].groupby(["device_id", "esm_session"]).mean()['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_challenge'}) + # All relevant features are joined by inner join to remove standalone columns (e.g., stressfulness event target has larger count) extracted_ers = extracted_ers.join(session_end_timestamp, on=['device_id', 'esm_session'], how='inner') \ .join(se_time, on=['device_id', 'esm_session'], how='inner') \ .join(se_duration, on=['device_id', 'esm_session'], how='inner') \ - .join(se_intensity, on=['device_id', 'esm_session'], how='inner') - + .join(se_stressfulness_event_tg, on=['device_id', 'esm_session'], how='inner') \ + .join(se_threat_tg, on=['device_id', 'esm_session'], how='inner') \ + .join(se_challenge_tg, on=['device_id', 'esm_session'], how='inner') + + # Filter sessions that are not useful. 
Because of the ambiguity this excludes: # (1) straw event times that are marked as "0 - I don't remember" # (2) straw event durations that are marked as "0 - I don't remember" @@ -163,8 +171,8 @@ def extract_ers(esm_df): extracted_ers["label"] = f"straw_event_{segmenting_method}_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3) - # Write the csv of extracted ERS labels with targets (stress event intensity) - extracted_ers[["label", "intensity"]].to_csv(snakemake.output[1], index=False) + # Write the csv of extracted ERS labels with targets related to stressfulness event + extracted_ers[["label", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"]].to_csv(snakemake.output[1], index=False) else: raise Exception("Please select correct target method for the event-related segments.") @@ -194,7 +202,7 @@ elif snakemake.params["stage"] == "merge": input_data_files = dict(snakemake.input) straw_events = pd.DataFrame(columns=["label", "event_timestamp", "length", "shift", "shift_direction", "device_id"]) - stress_events_targets = pd.DataFrame(columns=["label", "intensity"]) + stress_events_targets = pd.DataFrame(columns=["label", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"]) for input_file in input_data_files["ers_files"]: ers_df = pd.read_csv(input_file)