Add expl stress event script and other changes.

ml_pipeline
Primoz 2022-12-21 15:02:25 +01:00
parent adcb823d3f
commit 339142ff31
3 changed files with 27 additions and 19 deletions

View File

@ -75,7 +75,7 @@ extracted_ers.reset_index(drop=True, inplace=True)
# Add default duration in case the participant answered that no stressful event occurred # Add default duration in case the participant answered that no stressful event occurred
# Prepare data to fit the data structure in the CSV file ... # Prepare data to fit the data structure in the CSV file ...
# Add the event time as the end of the questionnaire if no stress event occurred # Add the event time as the start of the questionnaire if no stress event occurred
extracted_ers['se_time'] = extracted_ers['se_time'].fillna(extracted_ers['session_start_timestamp']) extracted_ers['se_time'] = extracted_ers['se_time'].fillna(extracted_ers['session_start_timestamp'])
# Type could be an int (timestamp [ms]), which stays the same, or a datetime str, which is converted to a timestamp in milliseconds # Type could be an int (timestamp [ms]), which stays the same, or a datetime str, which is converted to a timestamp in milliseconds
extracted_ers['event_timestamp'] = extracted_ers['se_time'].apply(lambda x: x if isinstance(x, int) else pd.to_datetime(x).timestamp() * 1000).astype('int64') extracted_ers['event_timestamp'] = extracted_ers['se_time'].apply(lambda x: x if isinstance(x, int) else pd.to_datetime(x).timestamp() * 1000).astype('int64')
@ -102,7 +102,7 @@ extracted_ers['se_duration'] = \
""">>>>> end section <<<<<""" """>>>>> end section <<<<<"""
# %% # %% [markdown]
# Count negative values of duration # Count negative values of duration
print("Count all:", extracted_ers[['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0]) print("Count all:", extracted_ers[['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
print("Count stressed:", extracted_ers[(~extracted_ers['se_duration'].isna())][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0]) print("Count stressed:", extracted_ers[(~extracted_ers['se_duration'].isna())][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
@ -111,14 +111,12 @@ print("Count 0 durations:", extracted_ers[extracted_ers['se_duration'] == 0][['s
extracted_ers[extracted_ers['se_duration'] <= 0][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0] extracted_ers[extracted_ers['se_duration'] <= 0][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0]
extracted_ers[(~extracted_ers['se_duration'].isna()) & (extracted_ers['se_duration'] <= 0)][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']] extracted_ers[(~extracted_ers['se_duration'].isna()) & (extracted_ers['se_duration'] <= 0)][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']]
ax = extracted_ers[(extracted_ers['se_duration'] < 5000) & (extracted_ers['se_duration'] > -300)].hist(column='se_duration', bins='auto', grid=False, figsize=(12,8), color='#86bf91', zorder=2, rwidth=0.9) ax = extracted_ers.hist(column='se_duration', bins='auto', grid=False, figsize=(12,8), color='#86bf91', zorder=2, rwidth=0.9)
extracted_ers[(extracted_ers['se_duration'] < 1000) & (extracted_ers['se_duration'] > -1000)]['se_duration'].value_counts()
hist, bin_edges = np.histogram(extracted_ers['se_duration'].dropna()) hist, bin_edges = np.histogram(extracted_ers['se_duration'].dropna())
hist hist
bin_edges bin_edges
extracted_ers['se_duration'].describe() extracted_ers = extracted_ers[extracted_ers['se_duration'] >= 0]
extracted_ers['se_duration'].median()
# %% # %%
# bins = [-100000000, 0, 0.0000001, 1200, 7200, 100000000] #'neg', 'zero', '<20min', '2h', 'high_pos' ..... right=False # bins = [-100000000, 0, 0.0000001, 1200, 7200, 100000000] #'neg', 'zero', '<20min', '2h', 'high_pos' ..... right=False
@ -131,15 +129,23 @@ sns.displot(
binwidth=0.1, binwidth=0.1,
) )
# %% # %% [markdown]
extracted_ers[extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'] >= 0]
extracted_ers['se_time'].value_counts()
pd.set_option('display.max_rows', 100)
# Here we are interested in how far the stress event times are from the end of the questionnaire. # Here we are interested in how far the stress event times are from the end of the questionnaire.
extracted_ers = extracted_ers[~extracted_ers['se_duration'].isna()] extracted_ers = extracted_ers[~extracted_ers['se_duration'].isna()] # Remove no stress events
extracted_ers[['session_end_timestamp', 'event_timestamp']]
extracted_ers['diff_se_time_session_end'] = (extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp']) extracted_ers['diff_se_time_session_end'] = (extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'])
extracted_ers['diff_se_time_session_end'].dropna().value_counts()
extracted_ers = extracted_ers[(extracted_ers['diff_se_time_session_end'] > 0)] print("Count all:", extracted_ers[['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']].shape[0])
bins2 = [-0.0000001, 0, 300, 600, 1200, 3600, 7200, 14400, 1000000000] # 'zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more' print("Count negative durations:", extracted_ers[extracted_ers['diff_se_time_session_end'] < 0][['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']])
extracted_ers['bins2'], edges = pd.cut(extracted_ers.diff_se_time_session_end, bins=bins2, labels=['zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'], retbins=True, right=True) #['low', 'medium', 'high'] print("Count 0 durations:", extracted_ers[extracted_ers['diff_se_time_session_end'] == 0][['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']].shape[0])
extracted_ers[extracted_ers['diff_se_time_session_end'] < 0]['diff_se_time_session_end']
# extracted_ers = extracted_ers[(extracted_ers['diff_se_time_session_end'] > 0)]
bins2 = [-100000, 0, 300, 600, 1200, 3600, 7200, 14400, 1000000000] # 'zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'
extracted_ers['bins2'], edges = pd.cut(extracted_ers.diff_se_time_session_end, bins=bins2, labels=['neg_zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'], retbins=True, right=True) #['low', 'medium', 'high']
extracted_ers['bins2']
sns.displot( sns.displot(
data=extracted_ers.dropna(), data=extracted_ers.dropna(),
x="bins2", x="bins2",
@ -149,6 +155,8 @@ sns.displot(
extracted_ers.shape extracted_ers.shape
extracted_ers.dropna().shape extracted_ers.dropna().shape
print()
# %% # %%
extracted_ers['appraisal_stressfulness_event_num'] = extracted_ers['appraisal_stressfulness_event'].str[0].astype(int) extracted_ers['appraisal_stressfulness_event_num'] = extracted_ers['appraisal_stressfulness_event'].str[0].astype(int)

View File

@ -41,12 +41,12 @@ if nb_dir not in sys.path:
# %% [markdown] # %% [markdown]
# ## Set script's parameters # ## Set script's parameters
cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter) cv_method_str = '5kfold' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
undersampling = True # (bool) If True this will train and test data on balanced dataset (using undersampling method) undersampling = True # (bool) If True this will train and test data on balanced dataset (using undersampling method)
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
model_input = pd.read_csv("../data/stressfulness_event_with_target_0/input_appraisal_stressfulness_event_mean.csv") model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv")
# model_input = model_input[model_input.columns.drop(list(model_input.filter(regex='empatica_temperature')))] # model_input = model_input[model_input.columns.drop(list(model_input.filter(regex='empatica_temperature')))]
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
@ -334,15 +334,15 @@ print(f"Smallest {n_sl} ACC:", np.sort(np.partition(rfc_scores['test_accuracy'],
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
rfc_es_fimp = pd.DataFrame(columns=list(train_x.columns)) rfc_es_fimp = pd.DataFrame(columns=list(train_x.columns))
for idx, estimator in enumerate(rfc_scores['estimator']): for idx, estimator in enumerate(rfc_scores['estimator']):
print("\nFeatures sorted by their score for estimator {}:".format(idx))
feature_importances = pd.DataFrame(estimator.feature_importances_, feature_importances = pd.DataFrame(estimator.feature_importances_,
index = list(train_x.columns), index = list(train_x.columns),
columns=['importance']) columns=['importance'])
print(feature_importances.sort_values('importance', ascending=False).head(10)) # print("\nFeatures sorted by their score for estimator {}:".format(idx))
# print(feature_importances.sort_values('importance', ascending=False).head(10))
rfc_es_fimp = pd.concat([rfc_es_fimp, feature_importances]).groupby(level=0).mean() rfc_es_fimp = pd.concat([rfc_es_fimp, feature_importances]).groupby(level=0).mean()
pd.set_option('display.max_rows', 100) pd.set_option('display.max_rows', 100)
print(rfc_es_fimp.sort_values('importance', ascending=False).head(100)) print(rfc_es_fimp.sort_values('importance', ascending=False).head(30))
rfc_es_fimp.sort_values('importance', ascending=False).head(30).plot.bar() rfc_es_fimp.sort_values('importance', ascending=False).head(30).plot.bar()

2
rapids

@ -1 +1 @@
Subproject commit 8a6b52a97c95dcd8b70b980b4f46421b1a847905 Subproject commit 7f5a4e6744e502d40dc38502e1e74bd2bf9fe786