Add expl stress event script and other changes.
parent
adcb823d3f
commit
339142ff31
|
@ -75,7 +75,7 @@ extracted_ers.reset_index(drop=True, inplace=True)
|
||||||
# Add default duration in case the participant answered that no stressful event occurred
|
# Add default duration in case the participant answered that no stressful event occurred
|
||||||
|
|
||||||
# Prepare data to fit the data structure in the CSV file ...
|
# Prepare data to fit the data structure in the CSV file ...
|
||||||
# Add the event time as the end of the questionnaire if no stress event occurred
|
# Add the event time as the start of the questionnaire if no stress event occurred
|
||||||
extracted_ers['se_time'] = extracted_ers['se_time'].fillna(extracted_ers['session_start_timestamp'])
|
extracted_ers['se_time'] = extracted_ers['se_time'].fillna(extracted_ers['session_start_timestamp'])
|
||||||
# The value may be an int (timestamp in ms), which is kept as is, or a datetime str, which is converted to a timestamp in milliseconds
|
# The value may be an int (timestamp in ms), which is kept as is, or a datetime str, which is converted to a timestamp in milliseconds
|
||||||
extracted_ers['event_timestamp'] = extracted_ers['se_time'].apply(lambda x: x if isinstance(x, int) else pd.to_datetime(x).timestamp() * 1000).astype('int64')
|
extracted_ers['event_timestamp'] = extracted_ers['se_time'].apply(lambda x: x if isinstance(x, int) else pd.to_datetime(x).timestamp() * 1000).astype('int64')
|
||||||
|
@ -102,7 +102,7 @@ extracted_ers['se_duration'] = \
|
||||||
|
|
||||||
""">>>>> end section <<<<<"""
|
""">>>>> end section <<<<<"""
|
||||||
|
|
||||||
# %%
|
# %% [markdown]
|
||||||
# Count negative values of duration
|
# Count negative values of duration
|
||||||
print("Count all:", extracted_ers[['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
|
print("Count all:", extracted_ers[['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
|
||||||
print("Count stressed:", extracted_ers[(~extracted_ers['se_duration'].isna())][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
|
print("Count stressed:", extracted_ers[(~extracted_ers['se_duration'].isna())][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
|
||||||
|
@ -111,14 +111,12 @@ print("Count 0 durations:", extracted_ers[extracted_ers['se_duration'] == 0][['s
|
||||||
extracted_ers[extracted_ers['se_duration'] <= 0][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0]
|
extracted_ers[extracted_ers['se_duration'] <= 0][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0]
|
||||||
extracted_ers[(~extracted_ers['se_duration'].isna()) & (extracted_ers['se_duration'] <= 0)][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']]
|
extracted_ers[(~extracted_ers['se_duration'].isna()) & (extracted_ers['se_duration'] <= 0)][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']]
|
||||||
|
|
||||||
ax = extracted_ers[(extracted_ers['se_duration'] < 5000) & (extracted_ers['se_duration'] > -300)].hist(column='se_duration', bins='auto', grid=False, figsize=(12,8), color='#86bf91', zorder=2, rwidth=0.9)
|
ax = extracted_ers.hist(column='se_duration', bins='auto', grid=False, figsize=(12,8), color='#86bf91', zorder=2, rwidth=0.9)
|
||||||
extracted_ers[(extracted_ers['se_duration'] < 1000) & (extracted_ers['se_duration'] > -1000)]['se_duration'].value_counts()
|
|
||||||
hist, bin_edges = np.histogram(extracted_ers['se_duration'].dropna())
|
hist, bin_edges = np.histogram(extracted_ers['se_duration'].dropna())
|
||||||
hist
|
hist
|
||||||
bin_edges
|
bin_edges
|
||||||
|
|
||||||
extracted_ers['se_duration'].describe()
|
extracted_ers = extracted_ers[extracted_ers['se_duration'] >= 0]
|
||||||
extracted_ers['se_duration'].median()
|
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
# bins = [-100000000, 0, 0.0000001, 1200, 7200, 100000000] #'neg', 'zero', '<20min', '2h', 'high_pos' ..... right=False
|
# bins = [-100000000, 0, 0.0000001, 1200, 7200, 100000000] #'neg', 'zero', '<20min', '2h', 'high_pos' ..... right=False
|
||||||
|
@ -131,15 +129,23 @@ sns.displot(
|
||||||
binwidth=0.1,
|
binwidth=0.1,
|
||||||
)
|
)
|
||||||
|
|
||||||
# %%
|
# %% [markdown]
|
||||||
|
extracted_ers[extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'] >= 0]
|
||||||
|
extracted_ers['se_time'].value_counts()
|
||||||
|
pd.set_option('display.max_rows', 100)
|
||||||
# Here we are interested in how far the stressful event times are from the end of the questionnaire.
|
# Here we are interested in how far the stressful event times are from the end of the questionnaire.
|
||||||
extracted_ers = extracted_ers[~extracted_ers['se_duration'].isna()]
|
extracted_ers = extracted_ers[~extracted_ers['se_duration'].isna()] # Remove no stress events
|
||||||
extracted_ers[['session_end_timestamp', 'event_timestamp']]
|
|
||||||
extracted_ers['diff_se_time_session_end'] = (extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'])
|
extracted_ers['diff_se_time_session_end'] = (extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'])
|
||||||
extracted_ers['diff_se_time_session_end'].dropna().value_counts()
|
|
||||||
extracted_ers = extracted_ers[(extracted_ers['diff_se_time_session_end'] > 0)]
|
print("Count all:", extracted_ers[['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']].shape[0])
|
||||||
bins2 = [-0.0000001, 0, 300, 600, 1200, 3600, 7200, 14400, 1000000000] # 'zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'
|
print("Count negative durations:", extracted_ers[extracted_ers['diff_se_time_session_end'] < 0][['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']])
|
||||||
extracted_ers['bins2'], edges = pd.cut(extracted_ers.diff_se_time_session_end, bins=bins2, labels=['zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'], retbins=True, right=True) #['low', 'medium', 'high']
|
print("Count 0 durations:", extracted_ers[extracted_ers['diff_se_time_session_end'] == 0][['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']].shape[0])
|
||||||
|
|
||||||
|
extracted_ers[extracted_ers['diff_se_time_session_end'] < 0]['diff_se_time_session_end']
|
||||||
|
# extracted_ers = extracted_ers[(extracted_ers['diff_se_time_session_end'] > 0)]
|
||||||
|
bins2 = [-100000, 0, 300, 600, 1200, 3600, 7200, 14400, 1000000000] # 'zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'
|
||||||
|
extracted_ers['bins2'], edges = pd.cut(extracted_ers.diff_se_time_session_end, bins=bins2, labels=['neg_zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'], retbins=True, right=True) #['low', 'medium', 'high']
|
||||||
|
extracted_ers['bins2']
|
||||||
sns.displot(
|
sns.displot(
|
||||||
data=extracted_ers.dropna(),
|
data=extracted_ers.dropna(),
|
||||||
x="bins2",
|
x="bins2",
|
||||||
|
@ -149,6 +155,8 @@ sns.displot(
|
||||||
extracted_ers.shape
|
extracted_ers.shape
|
||||||
extracted_ers.dropna().shape
|
extracted_ers.dropna().shape
|
||||||
|
|
||||||
|
print()
|
||||||
|
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
extracted_ers['appraisal_stressfulness_event_num'] = extracted_ers['appraisal_stressfulness_event'].str[0].astype(int)
|
extracted_ers['appraisal_stressfulness_event_num'] = extracted_ers['appraisal_stressfulness_event'].str[0].astype(int)
|
||||||
|
|
|
@ -41,12 +41,12 @@ if nb_dir not in sys.path:
|
||||||
|
|
||||||
# %% [markdown]
|
# %% [markdown]
|
||||||
# ## Set script's parameters
|
# ## Set script's parameters
|
||||||
cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
|
cv_method_str = '5kfold' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
|
||||||
n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
|
n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
|
||||||
undersampling = True # (bool) If True this will train and test data on balanced dataset (using undersampling method)
|
undersampling = True # (bool) If True this will train and test data on balanced dataset (using undersampling method)
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
model_input = pd.read_csv("../data/stressfulness_event_with_target_0/input_appraisal_stressfulness_event_mean.csv")
|
model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv")
|
||||||
# model_input = model_input[model_input.columns.drop(list(model_input.filter(regex='empatica_temperature')))]
|
# model_input = model_input[model_input.columns.drop(list(model_input.filter(regex='empatica_temperature')))]
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
|
@ -334,15 +334,15 @@ print(f"Smallest {n_sl} ACC:", np.sort(np.partition(rfc_scores['test_accuracy'],
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
rfc_es_fimp = pd.DataFrame(columns=list(train_x.columns))
|
rfc_es_fimp = pd.DataFrame(columns=list(train_x.columns))
|
||||||
for idx, estimator in enumerate(rfc_scores['estimator']):
|
for idx, estimator in enumerate(rfc_scores['estimator']):
|
||||||
print("\nFeatures sorted by their score for estimator {}:".format(idx))
|
|
||||||
feature_importances = pd.DataFrame(estimator.feature_importances_,
|
feature_importances = pd.DataFrame(estimator.feature_importances_,
|
||||||
index = list(train_x.columns),
|
index = list(train_x.columns),
|
||||||
columns=['importance'])
|
columns=['importance'])
|
||||||
print(feature_importances.sort_values('importance', ascending=False).head(10))
|
# print("\nFeatures sorted by their score for estimator {}:".format(idx))
|
||||||
|
# print(feature_importances.sort_values('importance', ascending=False).head(10))
|
||||||
rfc_es_fimp = pd.concat([rfc_es_fimp, feature_importances]).groupby(level=0).mean()
|
rfc_es_fimp = pd.concat([rfc_es_fimp, feature_importances]).groupby(level=0).mean()
|
||||||
|
|
||||||
pd.set_option('display.max_rows', 100)
|
pd.set_option('display.max_rows', 100)
|
||||||
print(rfc_es_fimp.sort_values('importance', ascending=False).head(100))
|
print(rfc_es_fimp.sort_values('importance', ascending=False).head(30))
|
||||||
|
|
||||||
rfc_es_fimp.sort_values('importance', ascending=False).head(30).plot.bar()
|
rfc_es_fimp.sort_values('importance', ascending=False).head(30).plot.bar()
|
||||||
|
|
||||||
|
|
2
rapids
2
rapids
|
@ -1 +1 @@
|
||||||
Subproject commit 8a6b52a97c95dcd8b70b980b4f46421b1a847905
|
Subproject commit 7f5a4e6744e502d40dc38502e1e74bd2bf9fe786
|
Loading…
Reference in New Issue