Compare commits
No commits in common. "8a6b52a97c95dcd8b70b980b4f46421b1a847905" and "6ebe83e47ea4da0066f4cd9dedbdb726cef5d06c" have entirely different histories.
8a6b52a97c ... 6ebe83e47e

config.yaml
@@ -26,7 +26,7 @@ TIME_SEGMENTS: &time_segments
   INCLUDE_PAST_PERIODIC_SEGMENTS: TRUE # Only relevant if TYPE=PERIODIC, see docs
   TAILORED_EVENTS: # Only relevant if TYPE=EVENT
     COMPUTE: True
-    SEGMENTING_METHOD: "30_before" # 30_before, 90_before, stress_event
+    TARGETS_METHOD: "stress_event" # 30_before, 90_before, stress_event

 # See https://www.rapids.science/latest/setup/configuration/#timezone-of-your-study
 TIMEZONE:

@@ -242,7 +242,7 @@ PHONE_ESM:
   STRAW:
     COMPUTE: True
     SCALES: ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support",
-             "appraisal_stressfulness_period", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"]
+             "appraisal_stressfulness_period", "appraisal_stressfulness_event"]
     FEATURES: [mean]
     SRC_SCRIPT: src/features/phone_esm/straw/main.py

@@ -710,7 +710,7 @@ ALL_CLEANING_OVERALL:
     COMPUTE: True
     MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
     CORR_THRESHOLD: 0.95
-    STANDARDIZATION: False
+    STANDARDIZATION: True
     SRC_SCRIPT: src/features/all_cleaning_overall/straw/main.py

@@ -732,8 +732,7 @@ PARAMS_FOR_ANALYSIS:

   TARGET:
     COMPUTE: True
-    LABEL: appraisal_stressfulness_event_mean
-    ALL_LABELS: [PANAS_positive_affect_mean, PANAS_negative_affect_mean, JCQ_job_demand_mean, JCQ_job_control_mean, JCQ_supervisor_support_mean,
-                 JCQ_coworker_support_mean, appraisal_stressfulness_period_mean, appraisal_stressfulness_event_mean, appraisal_threat_mean, appraisal_challenge_mean]
+    LABEL: PANAS_negative_affect_mean
+    ALL_LABELS: [appraisal_stressfulness_event_mean]
     # PANAS_positive_affect_mean, PANAS_negative_affect_mean, JCQ_job_demand_mean, JCQ_job_control_mean, JCQ_supervisor_support_mean,
-    # JCQ_coworker_support_mean, appraisal_stressfulness_period_mean, appraisal_stressfulness_event_mean, appraisal_threat_mean, appraisal_challenge_mean
+    # JCQ_coworker_support_mean, appraisal_stressfulness_period_mean, appraisal_stressfulness_event_mean
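For orientation, a minimal sketch (not part of the diff) of how downstream scripts pick up the renamed TARGETS_METHOD key and the new target label; the key paths come from the hunks in this comparison, while loading config.yaml directly is only illustrative, since the real scripts receive the config through Snakemake.

```python
import yaml

# Illustrative only: key paths follow this comparison; the pipeline scripts get
# `config` from Snakemake rather than reading config.yaml themselves.
with open("config.yaml") as stream:
    config = yaml.load(stream, Loader=yaml.FullLoader)

targets_method = config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["TARGETS_METHOD"]  # "stress_event"
target_label = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["LABEL"]                # "PANAS_negative_affect_mean"
target_column = "phone_esm_straw_" + target_label  # column name the cleaning scripts filter on
```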
@@ -27,10 +27,7 @@ def straw_cleaning(sensor_data_files, provider):
     # (1) FILTER_OUT THE ROWS THAT DO NOT HAVE THE TARGET COLUMN AVAILABLE
     if config['PARAMS_FOR_ANALYSIS']['TARGET']['COMPUTE']:
         target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL'] # get target label from config
-        if 'phone_esm_straw_' + target in features:
-            features = features[features['phone_esm_straw_' + target].notna()].reset_index(drop=True)
-        else:
-            return features
+        features = features[features['phone_esm_straw_' + target].notna()].reset_index(drop=True)

     # (2.1) QUALITY CHECK (DATA YIELD COLUMN) deletes the rows where E4 or phone data is low quality
     phone_data_yield_unit = provider["PHONE_DATA_YIELD_FEATURE"].split("_")[3].lower()

@@ -141,6 +138,8 @@ def straw_cleaning(sensor_data_files, provider):
         if esm not in features:
             features[esm] = esm_cols[esm]

+    fe6 = features.copy()
+
     # (9) VERIFY IF THERE ARE ANY NANS LEFT IN THE DATAFRAME
     if features.isna().any().any():
         raise ValueError("There are still some NaNs present in the dataframe. Please check for implementation errors.")
@@ -22,29 +22,14 @@ def straw_cleaning(sensor_data_files, provider, target):
     excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']

     graph_bf_af(features, "1target_rows_before")

-    # (1.0) OVERRIDE STRESSFULNESS EVENT TARGETS IF ERS SEGMENTING_METHOD IS "STRESS_EVENT"
-    if config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["SEGMENTING_METHOD"] == "stress_event":
-
-        stress_events_targets = pd.read_csv("data/external/stress_event_targets.csv")
-
-        if "appraisal_stressfulness_event_mean" in config['PARAMS_FOR_ANALYSIS']['TARGET']['ALL_LABELS']:
-            features.drop(columns=['phone_esm_straw_appraisal_stressfulness_event_mean'], inplace=True)
-            features = features.merge(stress_events_targets[["label", "appraisal_stressfulness_event"]] \
-                .rename(columns={'label': 'local_segment_label'}), on=['local_segment_label'], how='inner') \
-                .rename(columns={'appraisal_stressfulness_event': 'phone_esm_straw_appraisal_stressfulness_event_mean'})
-
-        if "appraisal_threat_mean" in config['PARAMS_FOR_ANALYSIS']['TARGET']['ALL_LABELS']:
-            features.drop(columns=['phone_esm_straw_appraisal_threat_mean'], inplace=True)
-            features = features.merge(stress_events_targets[["label", "appraisal_threat"]] \
-                .rename(columns={'label': 'local_segment_label'}), on=['local_segment_label'], how='inner') \
-                .rename(columns={'appraisal_threat': 'phone_esm_straw_appraisal_threat_mean'})
-
-        if "appraisal_challenge_mean" in config['PARAMS_FOR_ANALYSIS']['TARGET']['ALL_LABELS']:
-            features.drop(columns=['phone_esm_straw_appraisal_challenge_mean'], inplace=True)
-            features = features.merge(stress_events_targets[["label", "appraisal_challenge"]] \
-                .rename(columns={'label': 'local_segment_label'}), on=['local_segment_label'], how='inner') \
-                .rename(columns={'appraisal_challenge': 'phone_esm_straw_appraisal_challenge_mean'})
+    # (1.0) OVERRIDE STRESSFULNESS EVENT TARGETS IF ERS TARGETS_METHOD IS "STRESS_EVENT"
+    if config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["TARGETS_METHOD"] == "stress_event" and \
+        "appraisal_stressfulness_event_mean" in config['PARAMS_FOR_ANALYSIS']['TARGET']['ALL_LABELS']:
+        stress_events_targets = pd.read_csv("data/external/stress_event_targets.csv")
+        features.drop(columns=['phone_esm_straw_appraisal_stressfulness_event_mean'], inplace=True)
+        features = features.merge(stress_events_targets.rename(columns={'label': 'local_segment_label'}), on=['local_segment_label'], how='inner') \
+                        .rename(columns={'intensity': 'phone_esm_straw_appraisal_stressfulness_event_mean'})

     esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')] # Get target (esm) columns

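Assembled from the added lines above for readability, the new override behaves roughly like the sketch below; the helper name is hypothetical, while the CSV path and the "label"/"intensity" columns come from the ERS extraction script further down in this comparison.

```python
import pandas as pd

def override_stress_event_target(features: pd.DataFrame, config: dict) -> pd.DataFrame:
    """Hypothetical helper mirroring the added lines: replace the ESM-derived
    stressfulness target with the event-level 'intensity' written by the ERS step."""
    if config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["TARGETS_METHOD"] == "stress_event" and \
            "appraisal_stressfulness_event_mean" in config['PARAMS_FOR_ANALYSIS']['TARGET']['ALL_LABELS']:
        stress_events_targets = pd.read_csv("data/external/stress_event_targets.csv")
        # Drop the ESM-derived target and merge the event-level intensity on the segment label.
        features = features.drop(columns=['phone_esm_straw_appraisal_stressfulness_event_mean'])
        features = (features
                    .merge(stress_events_targets.rename(columns={'label': 'local_segment_label'}),
                           on='local_segment_label', how='inner')
                    .rename(columns={'intensity': 'phone_esm_straw_appraisal_stressfulness_event_mean'}))
    return features
```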
@@ -108,7 +93,7 @@ def straw_cleaning(sensor_data_files, provider, target):
     features[impute_w_sn2] = features[impute_w_sn2].fillna(1) # Special case of imputation - nominal/ordinal value

     impute_w_sn3 = [col for col in features.columns if "loglocationvariance" in col]
-    features[impute_w_sn3] = features[impute_w_sn3].fillna(-1000000) # Special case of imputation - loglocation
+    features[impute_w_sn2] = features[impute_w_sn2].fillna(-1000000) # Special case of imputation - loglocation

     # Impute location features
     impute_locations = [col for col in features \

@@ -218,16 +203,6 @@ def straw_cleaning(sensor_data_files, provider, target):

     graph_bf_af(features, "10correlation_drop")

-    # Transform categorical columns to category dtype
-
-    cat1 = [col for col in features.columns if "mostcommonactivity" in col]
-    if cat1: # Transform columns to category dtype (mostcommonactivity)
-        features[cat1] = features[cat1].astype(int).astype('category')
-
-    cat2 = [col for col in features.columns if "homelabel" in col]
-    if cat2: # Transform columns to category dtype (homelabel)
-        features[cat2] = features[cat2].astype(int).astype('category')
-
     # (10) VERIFY IF THERE ARE ANY NANS LEFT IN THE DATAFRAME
     if features.isna().any().any():
         raise ValueError("There are still some NaNs present in the dataframe. Please check for implementation errors.")

@@ -248,7 +223,7 @@ def impute(df, method='zero'):
         'knn': k_nearest(df)
     }[method]

-def graph_bf_af(features, phase_name, plt_flag=False):
+def graph_bf_af(features, phase_name, plt_flag=True):
     if plt_flag:
         sns.set(rc={"figure.figsize":(16, 8)})
         sns.heatmap(features.isna(), cbar=False) #features.select_dtypes(include=np.number)
@@ -42,8 +42,7 @@ def straw_features(sensor_data_files, time_segment, provider, filter_data_by_segment):
     requested_features = provider["FEATURES"]
     # name of the features this function can compute
     requested_scales = provider["SCALES"]
-    base_features_names = ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support",
-        "appraisal_stressfulness_period", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"]
+    base_features_names = ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support", "appraisal_stressfulness_period", "appraisal_stressfulness_event"]
     #TODO Check valid questionnaire and feature names.
     # the subset of requested features this function can compute
     features_to_compute = list(set(requested_features) & set(base_features_names))
@@ -10,15 +10,6 @@ from esm import classify_sessions_by_completion_time, preprocess_esm
 input_data_files = dict(snakemake.input)

 def format_timestamp(x):
-    """This method formates inputed timestamp into format "HH MM SS". Including spaces. If there is no hours or minutes present
-    that part is ignored, e.g., "MM SS" or just "SS".
-
-    Args:
-        x (int): unix timestamp in seconds
-
-    Returns:
-        str: formatted timestamp using "HH MM SS" sintax
-    """
     tstring=""
     space = False
     if x//3600 > 0:
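The removed docstring is the only description of this duration format, so a hedged re-implementation may help: the sketch below follows the described behaviour, it is not the repository's code, and whether parts are zero-padded is not confirmed by the lines shown.

```python
def format_duration_sketch(x: int) -> str:
    """Sketch of the behaviour described by the removed docstring: a duration in
    seconds becomes "HH MM SS", and leading hour/minute parts are dropped when zero,
    e.g. 45 -> "45", 330 -> "5 30", 5400 -> "1 30 0" (padding is an assumption)."""
    hours, rest = divmod(x, 3600)
    minutes, seconds = divmod(rest, 60)
    if hours > 0:
        parts = [hours, minutes, seconds]
    elif minutes > 0:
        parts = [minutes, seconds]
    else:
        parts = [seconds]
    return " ".join(str(p) for p in parts)
```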
@@ -32,23 +23,8 @@ def format_timestamp(x):
     return tstring


-def extract_ers(esm_df):
-    """This method has two major functionalities:
-        (1) It prepares STRAW event-related segments file with the use of esm file. The execution protocol is depended on
-            the segmenting method specified in the config.yaml file.
-        (2) It prepares and writes csv with targets and corresponding time segments labels. This is later used
-            in the overall cleaning script (straw).
-
-        Details about each segmenting method are listed below by each corresponding condition. Refer to the RAPIDS documentation for the
-        ERS file format: https://www.rapids.science/1.9/setup/configuration/#time-segments -> event segments
-
-    Args:
-        esm_df (DataFrame): read esm file that is dependend on the current participant.
-
-    Returns:
-        extracted_ers (DataFrame): dataframe with all necessary information to write event-related segments file
-            in the correct format.
-    """
+def extract_ers(esm_df, device_id):
     pd.set_option("display.max_rows", 20)
     pd.set_option("display.max_columns", None)

@@ -64,34 +40,23 @@ def extract_ers(esm_df):
     esm_filtered_sessions = classified[classified["session_response"] == 'ema_completed'].reset_index()[['device_id', 'esm_session']]
     esm_df = esm_preprocessed.loc[(esm_preprocessed['device_id'].isin(esm_filtered_sessions['device_id'])) & (esm_preprocessed['esm_session'].isin(esm_filtered_sessions['esm_session']))]

-    segmenting_method = config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["SEGMENTING_METHOD"]
+    targets_method = config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["TARGETS_METHOD"]

-    if segmenting_method in ["30_before", "90_before"]: # takes 30-minute peroid before the questionnaire + the duration of the questionnaire
-        """ '30-minutes and 90-minutes before' have the same fundamental logic with couple of deviations that will be explained below.
-            Both take x-minute period before the questionnaire that is summed with the questionnaire duration.
-            All questionnaire durations over 15 minutes are excluded from the querying.
-        """
+    if targets_method in ["30_before", "90_before"]: # takes 30-minute peroid before the questionnaire + the duration of the questionnaire
         # Extract time-relevant information
         extracted_ers = esm_df.groupby(["device_id", "esm_session"])['timestamp'].apply(lambda x: math.ceil((x.max() - x.min()) / 1000)).reset_index() # questionnaire length
-        extracted_ers["label"] = f"straw_event_{segmenting_method}_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3)
+        extracted_ers["label"] = f"straw_event_{targets_method}_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3)
         extracted_ers[['event_timestamp', 'device_id']] = esm_df.groupby(["device_id", "esm_session"])['timestamp'].min().reset_index()[['timestamp', 'device_id']]
         extracted_ers = extracted_ers[extracted_ers["timestamp"] <= 15 * 60].reset_index(drop=True) # ensure that the longest duration of the questionnaire anwsering is 15 min
         extracted_ers["shift_direction"] = -1

-        if segmenting_method == "30_before":
-            """The method 30-minutes before simply takes 30 minutes before the questionnaire and sums it with the questionnaire duration.
-                The timestamps are formatted with the help of format_timestamp() method.
-            """
+        if targets_method == "30_before":
             time_before_questionnaire = 30 * 60 # in seconds (30 minutes)

             extracted_ers["length"] = (extracted_ers["timestamp"] + time_before_questionnaire).apply(lambda x: format_timestamp(x))
             extracted_ers["shift"] = time_before_questionnaire
             extracted_ers["shift"] = extracted_ers["shift"].apply(lambda x: format_timestamp(x))

-        elif segmenting_method == "90_before":
-            """The method 90-minutes before has an important condition. If the time between the current and the previous questionnaire is
-                longer then 90 minutes it takes 90 minutes, otherwise it takes the original time difference between the questionnaires.
-            """
+        elif targets_method == "90_before":
             time_before_questionnaire = 90 * 60 # in seconds (90 minutes)

             extracted_ers[['end_event_timestamp', 'device_id']] = esm_df.groupby(["device_id", "esm_session"])['timestamp'].max().reset_index()[['timestamp', 'device_id']]
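The removed "90_before" docstring captures the rule the next hunk relies on (the gap to the previous questionnaire, capped at 90 minutes); a tiny hypothetical helper states it explicitly, since the computation of the "diffs" column itself falls outside the lines shown in this comparison.

```python
def ninety_before_shift(gap_to_previous_s: float, cap_s: int = 90 * 60) -> float:
    """Hypothetical helper restating the removed docstring: use the time since the
    previous questionnaire, but never more than 90 minutes."""
    return min(gap_to_previous_s, cap_s)
```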
@@ -104,75 +69,47 @@ def extract_ers(esm_df):
             extracted_ers["length"] = (extracted_ers["timestamp"] + extracted_ers["diffs"]).apply(lambda x: format_timestamp(x))
             extracted_ers["shift"] = extracted_ers["diffs"].apply(lambda x: format_timestamp(x))

-    elif segmenting_method == "stress_event":
-        """This is a special case of the method as it consists of two important parts:
-            (1) Generating of the ERS file (same as the methods above) and
-            (2) Generating targets file alongside with the correct time segment labels.
-
-            This extracts event-related segments, depended on the event time and duration specified by the participant in the next
-            questionnaire. Additionally, 5 minutes before the specified start time of this event is taken to take into a account the
-            possiblity of the participant not remembering the start time percisely => this parameter can be manipulated with the variable
-            "time_before_event" which is defined below.
-
-            By default, this method also excludes all events that are longer then 2.5 hours so that the segments are easily comparable.
-        """
+    elif targets_method == "stress_event":
         # Get and join required data
         extracted_ers = esm_df.groupby(["device_id", "esm_session"])['timestamp'].apply(lambda x: math.ceil((x.max() - x.min()) / 1000)).reset_index().rename(columns={'timestamp': 'session_length'}) # questionnaire end timestamp
         extracted_ers = extracted_ers[extracted_ers["session_length"] <= 15 * 60].reset_index(drop=True) # ensure that the longest duration of the questionnaire anwsering is 15 min
         session_end_timestamp = esm_df.groupby(['device_id', 'esm_session'])['timestamp'].max().to_frame().rename(columns={'timestamp': 'session_end_timestamp'}) # questionnaire end timestamp
         se_time = esm_df[esm_df.questionnaire_id == 90.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_time'})
         se_duration = esm_df[esm_df.questionnaire_id == 91.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_duration'})
+        se_intensity = esm_df[esm_df.questionnaire_id == 87.].set_index(['device_id', 'esm_session'])['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'intensity'})

-        # Extracted 3 targets that will be transfered with the csv file to the cleaning script.
-        se_stressfulness_event_tg = esm_df[esm_df.questionnaire_id == 87.].set_index(['device_id', 'esm_session'])['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_stressfulness_event'})
-        se_threat_tg = esm_df[esm_df.questionnaire_id == 88.].groupby(["device_id", "esm_session"]).mean()['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_threat'})
-        se_challenge_tg = esm_df[esm_df.questionnaire_id == 89.].groupby(["device_id", "esm_session"]).mean()['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_challenge'})
-
-        # All relevant features are joined by inner join to remove standalone columns (e.g., stressfulness event target has larger count)
         extracted_ers = extracted_ers.join(session_end_timestamp, on=['device_id', 'esm_session'], how='inner') \
             .join(se_time, on=['device_id', 'esm_session'], how='inner') \
             .join(se_duration, on=['device_id', 'esm_session'], how='inner') \
-            .join(se_stressfulness_event_tg, on=['device_id', 'esm_session'], how='inner') \
-            .join(se_threat_tg, on=['device_id', 'esm_session'], how='inner') \
-            .join(se_challenge_tg, on=['device_id', 'esm_session'], how='inner')
+            .join(se_intensity, on=['device_id', 'esm_session'], how='inner')

-        # Filter sessions that are not useful. Because of the ambiguity this excludes:
-        # (1) straw event times that are marked as "0 - I don't remember"
-        # (2) straw event durations that are marked as "0 - I don't remember"
-        extracted_ers = extracted_ers[(~extracted_ers.se_time.str.startswith("0 - ")) & (~extracted_ers.se_duration.str.startswith("0 - "))]
+        # Filter sessions that are not useful
+        extracted_ers = extracted_ers[(extracted_ers.se_time != "0 - Ne spomnim se") & (extracted_ers.se_duration != "0 - Ne spomnim se")]

         # Transform data into its final form, ready for the extraction
-        extracted_ers.reset_index(drop=True, inplace=True)
+        extracted_ers.reset_index(inplace=True)

         time_before_event = 5 * 60 # in seconds (5 minutes)
         extracted_ers['event_timestamp'] = pd.to_datetime(extracted_ers['se_time']).apply(lambda x: x.timestamp() * 1000).astype('int64')
         extracted_ers['shift_direction'] = -1

-        # Checks whether the duration is marked with "1 - It's still ongoing" which means that the end of the current questionnaire
-        # is taken as end time of the segment. Else the user input duration is taken.
         extracted_ers['se_duration'] = \
             np.where(
-                extracted_ers['se_duration'].str.startswith("1 - "),
+                extracted_ers['se_duration'] == "1 - Še vedno traja",
                 extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'],
                 extracted_ers['se_duration']
             )

-        # This converts the rows of timestamps in miliseconds and the row with datetime to timestamp in seconds.
         extracted_ers['se_duration'] = \
             extracted_ers['se_duration'].apply(lambda x: math.ceil(x / 1000) if isinstance(x, int) else (pd.to_datetime(x).hour * 60 + pd.to_datetime(x).minute) * 60) + time_before_event

+        extracted_ers = extracted_ers[extracted_ers["se_duration"] <= 2.5 * 60 * 60].reset_index(drop=True) # Exclude events that are longer than 2.5 hours
+
+        extracted_ers["label"] = f"straw_event_{targets_method}_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3)
         extracted_ers['shift'] = format_timestamp(time_before_event)
         extracted_ers['length'] = extracted_ers['se_duration'].apply(lambda x: format_timestamp(x))

-        # Drop event_timestamp duplicates in case of user referencing the same event over multiple questionnaires
-        extracted_ers.drop_duplicates(subset=["event_timestamp"], keep='first', inplace=True)
-        extracted_ers.reset_index(drop=True, inplace=True)
-
-        extracted_ers["label"] = f"straw_event_{segmenting_method}_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3)
-
-        # Write the csv of extracted ERS labels with targets related to stressfulness event
-        extracted_ers[["label", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"]].to_csv(snakemake.output[1], index=False)
+        extracted_ers[["label", "intensity"]].to_csv(snakemake.output[1], index=False)

     else:
         raise Exception("Please select correct target method for the event-related segments.")
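For the "stress_event" branch, the hand-off between the two files in this comparison can be summarised by the sketch below (a hypothetical function, with names taken from the diff): the ERS script writes (label, intensity) pairs, and the overall cleaning script later merges them back in on local_segment_label, as in the override hunk earlier.

```python
import pandas as pd

def write_stress_event_targets(extracted_ers: pd.DataFrame, pid: str,
                               targets_method: str, out_path: str) -> None:
    """Hypothetical sketch of the new target hand-off: label every event-related
    segment and write the (label, intensity) pairs for the cleaning script."""
    extracted_ers = extracted_ers.copy()
    extracted_ers["label"] = (f"straw_event_{targets_method}_" + pid + "_"
                              + extracted_ers.index.astype(str).str.zfill(3))
    extracted_ers[["label", "intensity"]].to_csv(out_path, index=False)
```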
@@ -181,20 +118,14 @@ def extract_ers(esm_df):
     return extracted_ers[["label", "event_timestamp", "length", "shift", "shift_direction", "device_id"]]


-"""
-Here the code is executed - this .py file is used both for extraction of the STRAW time_segments file for the individual
-participant, and also for merging all participant's files into one combined file which is later used for the time segments
-to all sensors assignment.
-
-There are two files involved (see rules extract_event_information_from_esm and merge_event_related_segments_files in preprocessing.smk)
-(1) ERS file which contains all the information about the time segment timings and
-(2) targets file which has corresponding target value for the segment label which is later used to merge with other features in the cleaning script.
-For more information, see the comment in the method above.
-"""
+# Actual code execution
 if snakemake.params["stage"] == "extract":
     esm_df = pd.read_csv(input_data_files['esm_raw_input'])

-    extracted_ers = extract_ers(esm_df)
+    with open(input_data_files['pid_file'], 'r') as stream:
+        pid_file = yaml.load(stream, Loader=yaml.FullLoader)
+
+    extracted_ers = extract_ers(esm_df, pid_file["PHONE"]["DEVICE_IDS"][0])

     extracted_ers.to_csv(snakemake.output[0], index=False)

@@ -202,7 +133,7 @@ elif snakemake.params["stage"] == "merge":

     input_data_files = dict(snakemake.input)
     straw_events = pd.DataFrame(columns=["label", "event_timestamp", "length", "shift", "shift_direction", "device_id"])
-    stress_events_targets = pd.DataFrame(columns=["label", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"])
+    stress_events_targets = pd.DataFrame(columns=["label", "intensity"])

     for input_file in input_data_files["ers_files"]:
         ers_df = pd.read_csv(input_file)
@@ -1,6 +1,5 @@
 import pandas as pd
-import sys
-import warnings

 def retain_target_column(df_input: pd.DataFrame, target_variable_name: str):
     column_names = df_input.columns

@@ -9,9 +8,9 @@ def retain_target_column(df_input: pd.DataFrame, target_variable_name: str):
     esm_names = column_names[esm_names_index]
     target_variable_index = esm_names.str.contains(target_variable_name)
     if all(~target_variable_index):
-        warnings.warn(f"The requested target (, {target_variable_name} ,)cannot be found in the dataset. Please check the names of phone_esm_ columns in cleaned python file")
-        return None
+        raise ValueError("The requested target (", target_variable_name,
+                         ")cannot be found in the dataset.",
+                         "Please check the names of phone_esm_ columns in z_all_sensor_features_cleaned_straw_py.csv")

     sensor_features_plus_target = df_input.drop(esm_names, axis=1)
     sensor_features_plus_target["target"] = df_input[esm_names[target_variable_index]]
     # We will only keep one column related to phone_esm and that will be our target variable.
@@ -7,7 +7,4 @@ target_variable_name = snakemake.params["target_variable"]

 model_input = retain_target_column(cleaned_sensor_features, target_variable_name)

-if model_input is None:
-    pd.DataFrame().to_csv(snakemake.output[0])
-else:
-    model_input.to_csv(snakemake.output[0], index=False)
+model_input.to_csv(snakemake.output[0], index=False)
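Taken together with the retain_target_column hunk above, the removed lines gave the old side a graceful fallback for a missing target, which the sketch below re-assembles from the removed lines (names come from the diff itself); the new side raises a ValueError instead, so this file no longer needs the None branch.

```python
import pandas as pd

# Old-side behaviour, assembled for clarity (a sketch, not the exact files):
# a missing target column triggered a warning, retain_target_column() returned None,
# and an empty CSV kept the pipeline going.
model_input = retain_target_column(cleaned_sensor_features, target_variable_name)

if model_input is None:
    pd.DataFrame().to_csv(snakemake.output[0])
else:
    model_input.to_csv(snakemake.output[0], index=False)
```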