Compare commits


No commits in common. "8a6b52a97c95dcd8b70b980b4f46421b1a847905" and "6ebe83e47ea4da0066f4cd9dedbdb726cef5d06c" have entirely different histories.

8 changed files with 47 additions and 148 deletions

BIN NaN.png 100644 (new binary file: 22 KiB image, not shown)

@@ -26,7 +26,7 @@ TIME_SEGMENTS: &time_segments
   INCLUDE_PAST_PERIODIC_SEGMENTS: TRUE # Only relevant if TYPE=PERIODIC, see docs
   TAILORED_EVENTS: # Only relevant if TYPE=EVENT
     COMPUTE: True
-    SEGMENTING_METHOD: "30_before" # 30_before, 90_before, stress_event
+    TARGETS_METHOD: "stress_event" # 30_before, 90_before, stress_event
 
 # See https://www.rapids.science/latest/setup/configuration/#timezone-of-your-study
 TIMEZONE:
@@ -242,7 +242,7 @@ PHONE_ESM:
   STRAW:
     COMPUTE: True
     SCALES: ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support",
-             "appraisal_stressfulness_period", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"]
+             "appraisal_stressfulness_period", "appraisal_stressfulness_event"]
     FEATURES: [mean]
     SRC_SCRIPT: src/features/phone_esm/straw/main.py
@@ -710,7 +710,7 @@ ALL_CLEANING_OVERALL:
     COMPUTE: True
     MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
     CORR_THRESHOLD: 0.95
-    STANDARDIZATION: False
+    STANDARDIZATION: True
     SRC_SCRIPT: src/features/all_cleaning_overall/straw/main.py
@@ -732,8 +732,7 @@ PARAMS_FOR_ANALYSIS:
   TARGET:
     COMPUTE: True
-    LABEL: appraisal_stressfulness_event_mean
-    ALL_LABELS: [PANAS_positive_affect_mean, PANAS_negative_affect_mean, JCQ_job_demand_mean, JCQ_job_control_mean, JCQ_supervisor_support_mean,
-                 JCQ_coworker_support_mean, appraisal_stressfulness_period_mean, appraisal_stressfulness_event_mean, appraisal_threat_mean, appraisal_challenge_mean]
+    LABEL: PANAS_negative_affect_mean
+    ALL_LABELS: [appraisal_stressfulness_event_mean]
     # PANAS_positive_affect_mean, PANAS_negative_affect_mean, JCQ_job_demand_mean, JCQ_job_control_mean, JCQ_supervisor_support_mean,
-    # JCQ_coworker_support_mean, appraisal_stressfulness_period_mean, appraisal_stressfulness_event_mean, appraisal_threat_mean, appraisal_challenge_mean
+    # JCQ_coworker_support_mean, appraisal_stressfulness_period_mean, appraisal_stressfulness_event_mean
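Note: as the cleaning scripts below show, the configured LABEL is resolved by prefixing the ESM sensor name. A minimal sketch (the config dict is illustrative, mirroring the YAML above):

    config = {"PARAMS_FOR_ANALYSIS": {"TARGET": {"COMPUTE": True, "LABEL": "PANAS_negative_affect_mean"}}}

    if config["PARAMS_FOR_ANALYSIS"]["TARGET"]["COMPUTE"]:
        target = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["LABEL"]
        target_column = "phone_esm_straw_" + target  # the column the cleaning scripts filter on
        print(target_column)  # phone_esm_straw_PANAS_negative_affect_mean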


@@ -27,10 +27,7 @@ def straw_cleaning(sensor_data_files, provider):
     # (1) FILTER_OUT THE ROWS THAT DO NOT HAVE THE TARGET COLUMN AVAILABLE
     if config['PARAMS_FOR_ANALYSIS']['TARGET']['COMPUTE']:
         target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL'] # get target label from config
-        if 'phone_esm_straw_' + target in features:
-            features = features[features['phone_esm_straw_' + target].notna()].reset_index(drop=True)
-        else:
-            return features
+        features = features[features['phone_esm_straw_' + target].notna()].reset_index(drop=True)
 
     # (2.1) QUALITY CHECK (DATA YIELD COLUMN) deletes the rows where E4 or phone data is low quality
     phone_data_yield_unit = provider["PHONE_DATA_YIELD_FEATURE"].split("_")[3].lower()
@@ -141,6 +138,8 @@ def straw_cleaning(sensor_data_files, provider):
         if esm not in features:
             features[esm] = esm_cols[esm]
 
+    fe6 = features.copy()
+
     # (9) VERIFY IF THERE ARE ANY NANS LEFT IN THE DATAFRAME
     if features.isna().any().any():
         raise ValueError("There are still some NaNs present in the dataframe. Please check for implementation errors.")
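For reference, the retained row filter behaves like this on a toy frame (a sketch; the column names mirror the diff, the data is made up):

    import pandas as pd

    features = pd.DataFrame({
        "phone_esm_straw_PANAS_negative_affect_mean": [2.0, None, 3.5],
        "phone_data_yield_rapids_ratiovalidyieldedminutes": [0.9, 0.8, 0.7],
    })
    target = "PANAS_negative_affect_mean"
    features = features[features["phone_esm_straw_" + target].notna()].reset_index(drop=True)
    print(len(features))  # 2: the row without a target value is dropped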


@@ -22,29 +22,14 @@ def straw_cleaning(sensor_data_files, provider, target):
     excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']
 
     graph_bf_af(features, "1target_rows_before")
 
-    # (1.0) OVERRIDE STRESSFULNESS EVENT TARGETS IF ERS SEGMENTING_METHOD IS "STRESS_EVENT"
-    if config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["SEGMENTING_METHOD"] == "stress_event":
+    # (1.0) OVERRIDE STRESSFULNESS EVENT TARGETS IF ERS TARGETS_METHOD IS "STRESS_EVENT"
+    if config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["TARGETS_METHOD"] == "stress_event" and \
+        "appraisal_stressfulness_event_mean" in config['PARAMS_FOR_ANALYSIS']['TARGET']['ALL_LABELS']:
         stress_events_targets = pd.read_csv("data/external/stress_event_targets.csv")
-
-        if "appraisal_stressfulness_event_mean" in config['PARAMS_FOR_ANALYSIS']['TARGET']['ALL_LABELS']:
-            features.drop(columns=['phone_esm_straw_appraisal_stressfulness_event_mean'], inplace=True)
-            features = features.merge(stress_events_targets[["label", "appraisal_stressfulness_event"]] \
-                .rename(columns={'label': 'local_segment_label'}), on=['local_segment_label'], how='inner') \
-                .rename(columns={'appraisal_stressfulness_event': 'phone_esm_straw_appraisal_stressfulness_event_mean'})
-
-        if "appraisal_threat_mean" in config['PARAMS_FOR_ANALYSIS']['TARGET']['ALL_LABELS']:
-            features.drop(columns=['phone_esm_straw_appraisal_threat_mean'], inplace=True)
-            features = features.merge(stress_events_targets[["label", "appraisal_threat"]] \
-                .rename(columns={'label': 'local_segment_label'}), on=['local_segment_label'], how='inner') \
-                .rename(columns={'appraisal_threat': 'phone_esm_straw_appraisal_threat_mean'})
-
-        if "appraisal_challenge_mean" in config['PARAMS_FOR_ANALYSIS']['TARGET']['ALL_LABELS']:
-            features.drop(columns=['phone_esm_straw_appraisal_challenge_mean'], inplace=True)
-            features = features.merge(stress_events_targets[["label", "appraisal_challenge"]] \
-                .rename(columns={'label': 'local_segment_label'}), on=['local_segment_label'], how='inner') \
-                .rename(columns={'appraisal_challenge': 'phone_esm_straw_appraisal_challenge_mean'})
+        features.drop(columns=['phone_esm_straw_appraisal_stressfulness_event_mean'], inplace=True)
+        features = features.merge(stress_events_targets.rename(columns={'label': 'local_segment_label'}), on=['local_segment_label'], how='inner') \
+            .rename(columns={'intensity': 'phone_esm_straw_appraisal_stressfulness_event_mean'})
 
     esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')] # Get target (esm) columns
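The override above replaces the questionnaire-derived target with the event intensity from the external csv; a toy run of the same merge (labels and values are made up):

    import pandas as pd

    features = pd.DataFrame({
        "local_segment_label": ["straw_event_stress_event_p01_000", "straw_event_stress_event_p01_001"],
        "phone_esm_straw_appraisal_stressfulness_event_mean": [1.0, 2.0],
    })
    stress_events_targets = pd.DataFrame({"label": ["straw_event_stress_event_p01_000"], "intensity": [3.0]})

    features.drop(columns=["phone_esm_straw_appraisal_stressfulness_event_mean"], inplace=True)
    features = features.merge(stress_events_targets.rename(columns={"label": "local_segment_label"}),
                              on=["local_segment_label"], how="inner") \
                       .rename(columns={"intensity": "phone_esm_straw_appraisal_stressfulness_event_mean"})
    # One row survives the inner join; its target is now the reported intensity (3.0).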
@@ -108,7 +93,7 @@ def straw_cleaning(sensor_data_files, provider, target):
     features[impute_w_sn2] = features[impute_w_sn2].fillna(1) # Special case of imputation - nominal/ordinal value
 
     impute_w_sn3 = [col for col in features.columns if "loglocationvariance" in col]
-    features[impute_w_sn3] = features[impute_w_sn3].fillna(-1000000) # Special case of imputation - loglocation
+    features[impute_w_sn2] = features[impute_w_sn2].fillna(-1000000) # Special case of imputation - loglocation
 
     # Impute location features
     impute_locations = [col for col in features \
@@ -218,16 +203,6 @@ def straw_cleaning(sensor_data_files, provider, target):
 
     graph_bf_af(features, "10correlation_drop")
 
-    # Transform categorical columns to category dtype
-    cat1 = [col for col in features.columns if "mostcommonactivity" in col]
-    if cat1: # Transform columns to category dtype (mostcommonactivity)
-        features[cat1] = features[cat1].astype(int).astype('category')
-
-    cat2 = [col for col in features.columns if "homelabel" in col]
-    if cat2: # Transform columns to category dtype (homelabel)
-        features[cat2] = features[cat2].astype(int).astype('category')
-
     # (10) VERIFY IF THERE ARE ANY NANS LEFT IN THE DATAFRAME
     if features.isna().any().any():
         raise ValueError("There are still some NaNs present in the dataframe. Please check for implementation errors.")
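The removed block cast integer-coded columns to pandas' category dtype, presumably so that downstream steps do not treat the codes as continuous values. The pattern, on a hypothetical column name:

    import pandas as pd

    df = pd.DataFrame({"phone_sensor_mostcommonactivity": [1.0, 3.0, 1.0]})  # made-up column
    cat1 = [col for col in df.columns if "mostcommonactivity" in col]
    if cat1:
        df[cat1] = df[cat1].astype(int).astype("category")
    print(df.dtypes)  # category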
@@ -248,7 +223,7 @@ def impute(df, method='zero'):
         'knn': k_nearest(df)
     }[method]
 
-def graph_bf_af(features, phase_name, plt_flag=False):
+def graph_bf_af(features, phase_name, plt_flag=True):
     if plt_flag:
         sns.set(rc={"figure.figsize":(16, 8)})
         sns.heatmap(features.isna(), cbar=False) #features.select_dtypes(include=np.number)


@@ -42,8 +42,7 @@ def straw_features(sensor_data_files, time_segment, provider, filter_data_by_segment):
     requested_features = provider["FEATURES"]
     # name of the features this function can compute
     requested_scales = provider["SCALES"]
-    base_features_names = ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support",
-                           "appraisal_stressfulness_period", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"]
+    base_features_names = ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support", "appraisal_stressfulness_period", "appraisal_stressfulness_event"]
     #TODO Check valid questionnaire and feature names.
     # the subset of requested features this function can compute
     features_to_compute = list(set(requested_features) & set(base_features_names))


@@ -10,15 +10,6 @@ from esm import classify_sessions_by_completion_time, preprocess_esm
 input_data_files = dict(snakemake.input)
 
 def format_timestamp(x):
-    """This method formats the input timestamp into the "HH MM SS" format, including spaces. If no hours or minutes are present,
-    that part is omitted, e.g., "MM SS" or just "SS".
-
-    Args:
-        x (int): unix timestamp in seconds
-
-    Returns:
-        str: formatted timestamp using the "HH MM SS" syntax
-    """
     tstring=""
     space = False
     if x//3600 > 0:
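The removed docstring documents the helper's contract; a sketch consistent with it (the exact implementation in the repo may differ):

    def format_timestamp(x):
        """Format x seconds as "HH MM SS", omitting absent leading parts."""
        tstring = ""
        space = False
        if x // 3600 > 0:
            tstring += str(x // 3600)
            space = True
        if x % 3600 // 60 > 0 or space:
            tstring += (" " if space else "") + str(x % 3600 // 60)
            space = True
        tstring += (" " if space else "") + str(x % 60)
        return tstring

    print(format_timestamp(3725))  # "1 2 5" (1 h, 2 min, 5 s)
    print(format_timestamp(125))   # "2 5"
    print(format_timestamp(45))    # "45"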
@@ -32,23 +23,8 @@ def format_timestamp(x):
     return tstring
 
-def extract_ers(esm_df):
-    """This method has two major functionalities:
-        (1) It prepares the STRAW event-related segments file from the esm file. The execution protocol depends on
-            the segmenting method specified in the config.yaml file.
-        (2) It prepares and writes a csv with targets and the corresponding time segment labels. This is later used
-            in the overall cleaning script (straw).
-
-    Details about each segmenting method are listed below under each corresponding condition. Refer to the RAPIDS documentation for the
-    ERS file format: https://www.rapids.science/1.9/setup/configuration/#time-segments -> event segments
-
-    Args:
-        esm_df (DataFrame): the read esm file, which depends on the current participant.
-
-    Returns:
-        extracted_ers (DataFrame): dataframe with all the necessary information to write the event-related segments file
-            in the correct format.
-    """
+def extract_ers(esm_df, device_id):
     pd.set_option("display.max_rows", 20)
     pd.set_option("display.max_columns", None)
@@ -64,34 +40,23 @@ def extract_ers(esm_df):
     esm_filtered_sessions = classified[classified["session_response"] == 'ema_completed'].reset_index()[['device_id', 'esm_session']]
     esm_df = esm_preprocessed.loc[(esm_preprocessed['device_id'].isin(esm_filtered_sessions['device_id'])) & (esm_preprocessed['esm_session'].isin(esm_filtered_sessions['esm_session']))]
 
-    segmenting_method = config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["SEGMENTING_METHOD"]
+    targets_method = config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["TARGETS_METHOD"]
 
-    if segmenting_method in ["30_before", "90_before"]: # takes the 30-minute period before the questionnaire + the duration of the questionnaire
-        """
-        '30-minutes and 90-minutes before' share the same fundamental logic, with a couple of deviations that are explained below.
-        Both take the x-minute period before the questionnaire and sum it with the questionnaire duration.
-        All questionnaire durations over 15 minutes are excluded from the querying.
-        """
+    if targets_method in ["30_before", "90_before"]: # takes the 30-minute period before the questionnaire + the duration of the questionnaire
         # Extract time-relevant information
         extracted_ers = esm_df.groupby(["device_id", "esm_session"])['timestamp'].apply(lambda x: math.ceil((x.max() - x.min()) / 1000)).reset_index() # questionnaire length
-        extracted_ers["label"] = f"straw_event_{segmenting_method}_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3)
+        extracted_ers["label"] = f"straw_event_{targets_method}_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3)
         extracted_ers[['event_timestamp', 'device_id']] = esm_df.groupby(["device_id", "esm_session"])['timestamp'].min().reset_index()[['timestamp', 'device_id']]
         extracted_ers = extracted_ers[extracted_ers["timestamp"] <= 15 * 60].reset_index(drop=True) # ensure that the longest duration of answering the questionnaire is 15 min
         extracted_ers["shift_direction"] = -1
 
-        if segmenting_method == "30_before":
-            """The 30-minutes-before method simply takes the 30 minutes before the questionnaire and sums them with the questionnaire duration.
-            The timestamps are formatted with the help of the format_timestamp() method.
-            """
+        if targets_method == "30_before":
             time_before_questionnaire = 30 * 60 # in seconds (30 minutes)
             extracted_ers["length"] = (extracted_ers["timestamp"] + time_before_questionnaire).apply(lambda x: format_timestamp(x))
             extracted_ers["shift"] = time_before_questionnaire
             extracted_ers["shift"] = extracted_ers["shift"].apply(lambda x: format_timestamp(x))
 
-        elif segmenting_method == "90_before":
-            """The 90-minutes-before method has an important condition: if the time between the current and the previous questionnaire is
-            longer than 90 minutes, it takes 90 minutes; otherwise it takes the original time difference between the questionnaires.
-            """
+        elif targets_method == "90_before":
             time_before_questionnaire = 90 * 60 # in seconds (90 minutes)
             extracted_ers[['end_event_timestamp', 'device_id']] = esm_df.groupby(["device_id", "esm_session"])['timestamp'].max().reset_index()[['timestamp', 'device_id']]
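Worked example for "30_before" (made-up numbers; the string renderings assume a format_timestamp along the lines of the sketch further up): a questionnaire answered over 4 minutes gives timestamp = 240 s, so

    time_before_questionnaire = 30 * 60                                           # 1800 s
    questionnaire_length = 4 * 60                                                 # 240 s between first and last answer
    length = format_timestamp(questionnaire_length + time_before_questionnaire)   # "34 0" (34-minute segment)
    shift = format_timestamp(time_before_questionnaire)                           # "30 0"
    shift_direction = -1                                                          # shift backwards from event_timestamp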
@@ -104,75 +69,47 @@ def extract_ers(esm_df):
             extracted_ers["length"] = (extracted_ers["timestamp"] + extracted_ers["diffs"]).apply(lambda x: format_timestamp(x))
             extracted_ers["shift"] = extracted_ers["diffs"].apply(lambda x: format_timestamp(x))
 
-    elif segmenting_method == "stress_event":
-        """This is a special case of the method as it consists of two important parts:
-            (1) Generating the ERS file (same as the methods above), and
-            (2) Generating the targets file along with the correct time segment labels.
-
-        This extracts event-related segments, depending on the event time and duration specified by the participant in the next
-        questionnaire. Additionally, the 5 minutes before the specified start time of the event are included, to account for the
-        possibility of the participant not remembering the start time precisely => this parameter can be manipulated with the variable
-        "time_before_event" which is defined below.
-
-        By default, this method also excludes all events that are longer than 2.5 hours so that the segments are easily comparable.
-        """
+    elif targets_method == "stress_event":
         # Get and join required data
         extracted_ers = esm_df.groupby(["device_id", "esm_session"])['timestamp'].apply(lambda x: math.ceil((x.max() - x.min()) / 1000)).reset_index().rename(columns={'timestamp': 'session_length'}) # questionnaire length
         extracted_ers = extracted_ers[extracted_ers["session_length"] <= 15 * 60].reset_index(drop=True) # ensure that the longest duration of answering the questionnaire is 15 min
         session_end_timestamp = esm_df.groupby(['device_id', 'esm_session'])['timestamp'].max().to_frame().rename(columns={'timestamp': 'session_end_timestamp'}) # questionnaire end timestamp
         se_time = esm_df[esm_df.questionnaire_id == 90.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_time'})
         se_duration = esm_df[esm_df.questionnaire_id == 91.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_duration'})
+        se_intensity = esm_df[esm_df.questionnaire_id == 87.].set_index(['device_id', 'esm_session'])['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'intensity'})
 
-        # Extract the 3 targets that will be transferred with the csv file to the cleaning script.
-        se_stressfulness_event_tg = esm_df[esm_df.questionnaire_id == 87.].set_index(['device_id', 'esm_session'])['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_stressfulness_event'})
-        se_threat_tg = esm_df[esm_df.questionnaire_id == 88.].groupby(["device_id", "esm_session"]).mean()['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_threat'})
-        se_challenge_tg = esm_df[esm_df.questionnaire_id == 89.].groupby(["device_id", "esm_session"]).mean()['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_challenge'})
-
-        # All relevant features are joined with an inner join to remove standalone columns (e.g., the stressfulness event target has a larger count)
         extracted_ers = extracted_ers.join(session_end_timestamp, on=['device_id', 'esm_session'], how='inner') \
             .join(se_time, on=['device_id', 'esm_session'], how='inner') \
             .join(se_duration, on=['device_id', 'esm_session'], how='inner') \
-            .join(se_stressfulness_event_tg, on=['device_id', 'esm_session'], how='inner') \
-            .join(se_threat_tg, on=['device_id', 'esm_session'], how='inner') \
-            .join(se_challenge_tg, on=['device_id', 'esm_session'], how='inner')
+            .join(se_intensity, on=['device_id', 'esm_session'], how='inner')
 
-        # Filter sessions that are not useful. Because of the ambiguity this excludes:
-        # (1) straw event times that are marked as "0 - I don't remember"
-        # (2) straw event durations that are marked as "0 - I don't remember"
-        extracted_ers = extracted_ers[(~extracted_ers.se_time.str.startswith("0 - ")) & (~extracted_ers.se_duration.str.startswith("0 - "))]
+        # Filter sessions that are not useful
+        extracted_ers = extracted_ers[(extracted_ers.se_time != "0 - Ne spomnim se") & (extracted_ers.se_duration != "0 - Ne spomnim se")]
 
         # Transform data into its final form, ready for the extraction
-        extracted_ers.reset_index(drop=True, inplace=True)
+        extracted_ers.reset_index(inplace=True)
 
         time_before_event = 5 * 60 # in seconds (5 minutes)
        extracted_ers['event_timestamp'] = pd.to_datetime(extracted_ers['se_time']).apply(lambda x: x.timestamp() * 1000).astype('int64')
         extracted_ers['shift_direction'] = -1
 
-        # Check whether the duration is marked with "1 - It's still ongoing", which means that the end of the current questionnaire
-        # is taken as the end time of the segment. Otherwise the user-input duration is taken.
         extracted_ers['se_duration'] = \
             np.where(
-                extracted_ers['se_duration'].str.startswith("1 - "),
+                extracted_ers['se_duration'] == "1 - Še vedno traja",
                 extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'],
                 extracted_ers['se_duration']
             )
 
-        # This converts the rows with timestamps in milliseconds and the rows with datetimes to timestamps in seconds.
         extracted_ers['se_duration'] = \
             extracted_ers['se_duration'].apply(lambda x: math.ceil(x / 1000) if isinstance(x, int) else (pd.to_datetime(x).hour * 60 + pd.to_datetime(x).minute) * 60) + time_before_event
+        extracted_ers = extracted_ers[extracted_ers["se_duration"] <= 2.5 * 60 * 60].reset_index(drop=True) # Exclude events that are longer than 2.5 hours
 
+        extracted_ers["label"] = f"straw_event_{targets_method}_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3)
         extracted_ers['shift'] = format_timestamp(time_before_event)
         extracted_ers['length'] = extracted_ers['se_duration'].apply(lambda x: format_timestamp(x))
 
-        # Drop event_timestamp duplicates in case the user referenced the same event over multiple questionnaires
-        extracted_ers.drop_duplicates(subset=["event_timestamp"], keep='first', inplace=True)
-        extracted_ers.reset_index(drop=True, inplace=True)
-        extracted_ers["label"] = f"straw_event_{segmenting_method}_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3)
-
-        # Write the csv of extracted ERS labels with targets related to the stressfulness event
-        extracted_ers[["label", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"]].to_csv(snakemake.output[1], index=False)
+        extracted_ers[["label", "intensity"]].to_csv(snakemake.output[1], index=False)
 
     else:
         raise Exception("Please select the correct target method for the event-related segments.")
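A worked pass through the duration handling above (values made up): suppose the participant reports the event as still ongoing ("1 - Še vedno traja", i.e. "still ongoing") and the questionnaire ends 900 000 ms after the reported event start:

    import math

    session_end_timestamp = 1_656_000_900_000   # ms
    event_timestamp = 1_656_000_000_000         # ms
    time_before_event = 5 * 60                  # 300 s margin before the reported start

    duration_ms = session_end_timestamp - event_timestamp            # ongoing: ends with the questionnaire
    se_duration = math.ceil(duration_ms / 1000) + time_before_event
    print(se_duration)  # 1200 s, kept because 1200 <= 2.5 * 60 * 60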
@@ -181,20 +118,14 @@ def extract_ers(esm_df):
     return extracted_ers[["label", "event_timestamp", "length", "shift", "shift_direction", "device_id"]]
 
-"""
-Here the code is executed - this .py file is used both for extracting the STRAW time_segments file for an individual
-participant and for merging all participants' files into one combined file, which is later used to assign the time segments
-to all sensors.
-
-Two files are involved (see the rules extract_event_information_from_esm and merge_event_related_segments_files in preprocessing.smk):
-(1) the ERS file, which contains all the information about the time segment timings, and
-(2) the targets file, which holds the target value for each segment label and is later merged with the other features in the cleaning script.
-
-For more information, see the comment in the method above.
-"""
+# Actual code execution
 if snakemake.params["stage"] == "extract":
     esm_df = pd.read_csv(input_data_files['esm_raw_input'])
-    extracted_ers = extract_ers(esm_df)
+
+    with open(input_data_files['pid_file'], 'r') as stream:
+        pid_file = yaml.load(stream, Loader=yaml.FullLoader)
+
+    extracted_ers = extract_ers(esm_df, pid_file["PHONE"]["DEVICE_IDS"][0])
     extracted_ers.to_csv(snakemake.output[0], index=False)
@@ -202,7 +133,7 @@ elif snakemake.params["stage"] == "merge":
 
     input_data_files = dict(snakemake.input)
 
     straw_events = pd.DataFrame(columns=["label", "event_timestamp", "length", "shift", "shift_direction", "device_id"])
-    stress_events_targets = pd.DataFrame(columns=["label", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"])
+    stress_events_targets = pd.DataFrame(columns=["label", "intensity"])
 
     for input_file in input_data_files["ers_files"]:
         ers_df = pd.read_csv(input_file)
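The merge stage presumably appends each participant's per-pid file to the combined frames initialised above before writing them out; a sketch of the loop body under that assumption (only the loop header and the read_csv appear in the diff):

    for input_file in input_data_files["ers_files"]:
        ers_df = pd.read_csv(input_file)
        straw_events = pd.concat([straw_events, ers_df], axis=0, ignore_index=True)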


@@ -1,6 +1,5 @@
 import pandas as pd
-import sys
-import warnings
 
 def retain_target_column(df_input: pd.DataFrame, target_variable_name: str):
     column_names = df_input.columns
@@ -9,9 +8,9 @@ def retain_target_column(df_input: pd.DataFrame, target_variable_name: str):
     esm_names = column_names[esm_names_index]
     target_variable_index = esm_names.str.contains(target_variable_name)
     if all(~target_variable_index):
-        warnings.warn(f"The requested target ({target_variable_name}) cannot be found in the dataset. Please check the names of the phone_esm_ columns in the cleaned python file.")
-        return None
+        raise ValueError("The requested target (", target_variable_name,
+                         ") cannot be found in the dataset.",
+                         "Please check the names of phone_esm_ columns in z_all_sensor_features_cleaned_straw_py.csv")
 
     sensor_features_plus_target = df_input.drop(esm_names, axis=1)
     sensor_features_plus_target["target"] = df_input[esm_names[target_variable_index]]
     # We will only keep one column related to phone_esm and that will be our target variable.
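Usage sketch for retain_target_column (toy frame with made-up values; the expected output follows from the function body above):

    import pandas as pd

    df = pd.DataFrame({
        "phone_esm_straw_PANAS_negative_affect_mean": [2.0, 3.0],
        "phone_esm_straw_JCQ_job_demand_mean": [1.0, 1.5],
        "phone_data_yield_rapids_ratiovalidyieldedminutes": [0.9, 0.8],
    })
    model_input = retain_target_column(df, "PANAS_negative_affect_mean")
    print(list(model_input.columns))  # ['phone_data_yield_rapids_ratiovalidyieldedminutes', 'target']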


@@ -7,7 +7,4 @@ target_variable_name = snakemake.params["target_variable"]
 model_input = retain_target_column(cleaned_sensor_features, target_variable_name)
 
-if model_input is None:
-    pd.DataFrame().to_csv(snakemake.output[0])
-else:
-    model_input.to_csv(snakemake.output[0], index=False)
+model_input.to_csv(snakemake.output[0], index=False)