diff --git a/exploration/expl_stress_event.py b/exploration/expl_stress_event.py index b2aaabc..8fc5bf1 100644 --- a/exploration/expl_stress_event.py +++ b/exploration/expl_stress_event.py @@ -75,7 +75,7 @@ extracted_ers.reset_index(drop=True, inplace=True) # Add default duration in case if participant answered that no stressful event occured # Prepare data to fit the data structure in the CSV file ... -# Add the event time as the end of the questionnaire if no stress event occured +# Add the event time as the start of the questionnaire if no stress event occured extracted_ers['se_time'] = extracted_ers['se_time'].fillna(extracted_ers['session_start_timestamp']) # Type could be an int (timestamp [ms]) which stays the same, and datetime str which is converted to timestamp in miliseconds extracted_ers['event_timestamp'] = extracted_ers['se_time'].apply(lambda x: x if isinstance(x, int) else pd.to_datetime(x).timestamp() * 1000).astype('int64') @@ -102,7 +102,7 @@ extracted_ers['se_duration'] = \ """>>>>> end section <<<<<""" -# %% +# %% [markdown] # Count negative values of duration print("Count all:", extracted_ers[['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0]) print("Count stressed:", extracted_ers[(~extracted_ers['se_duration'].isna())][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0]) @@ -111,14 +111,12 @@ print("Count 0 durations:", extracted_ers[extracted_ers['se_duration'] == 0][['s extracted_ers[extracted_ers['se_duration'] <= 0][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0] extracted_ers[(~extracted_ers['se_duration'].isna()) & (extracted_ers['se_duration'] <= 0)][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']] -ax = extracted_ers[(extracted_ers['se_duration'] < 5000) & (extracted_ers['se_duration'] > -300)].hist(column='se_duration', bins='auto', grid=False, figsize=(12,8), color='#86bf91', zorder=2, rwidth=0.9) -extracted_ers[(extracted_ers['se_duration'] < 1000) & (extracted_ers['se_duration'] > -1000)]['se_duration'].value_counts() +ax = extracted_ers.hist(column='se_duration', bins='auto', grid=False, figsize=(12,8), color='#86bf91', zorder=2, rwidth=0.9) hist, bin_edges = np.histogram(extracted_ers['se_duration'].dropna()) hist bin_edges -extracted_ers['se_duration'].describe() -extracted_ers['se_duration'].median() +extracted_ers = extracted_ers[extracted_ers['se_duration'] >= 0] # %% # bins = [-100000000, 0, 0.0000001, 1200, 7200, 100000000] #'neg', 'zero', '<20min', '2h', 'high_pos' ..... right=False @@ -131,15 +129,23 @@ sns.displot( binwidth=0.1, ) -# %% +# %% [markdown] +extracted_ers[extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'] >= 0] +extracted_ers['se_time'].value_counts() +pd.set_option('display.max_rows', 100) # Tukaj nas zanima, koliko so oddaljeni časi stresnega dogodka od konca vprašalnika. -extracted_ers = extracted_ers[~extracted_ers['se_duration'].isna()] -extracted_ers[['session_end_timestamp', 'event_timestamp']] +extracted_ers = extracted_ers[~extracted_ers['se_duration'].isna()] # Remove no stress events extracted_ers['diff_se_time_session_end'] = (extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp']) -extracted_ers['diff_se_time_session_end'].dropna().value_counts() -extracted_ers = extracted_ers[(extracted_ers['diff_se_time_session_end'] > 0)] -bins2 = [-0.0000001, 0, 300, 600, 1200, 3600, 7200, 14400, 1000000000] # 'zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more' -extracted_ers['bins2'], edges = pd.cut(extracted_ers.diff_se_time_session_end, bins=bins2, labels=['zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'], retbins=True, right=True) #['low', 'medium', 'high'] + +print("Count all:", extracted_ers[['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']].shape[0]) +print("Count negative durations:", extracted_ers[extracted_ers['diff_se_time_session_end'] < 0][['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']]) +print("Count 0 durations:", extracted_ers[extracted_ers['diff_se_time_session_end'] == 0][['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']].shape[0]) + +extracted_ers[extracted_ers['diff_se_time_session_end'] < 0]['diff_se_time_session_end'] +# extracted_ers = extracted_ers[(extracted_ers['diff_se_time_session_end'] > 0)] +bins2 = [-100000, 0, 300, 600, 1200, 3600, 7200, 14400, 1000000000] # 'zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more' +extracted_ers['bins2'], edges = pd.cut(extracted_ers.diff_se_time_session_end, bins=bins2, labels=['neg_zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'], retbins=True, right=True) #['low', 'medium', 'high'] +extracted_ers['bins2'] sns.displot( data=extracted_ers.dropna(), x="bins2", @@ -149,6 +155,8 @@ sns.displot( extracted_ers.shape extracted_ers.dropna().shape +print() + # %% extracted_ers['appraisal_stressfulness_event_num'] = extracted_ers['appraisal_stressfulness_event'].str[0].astype(int) diff --git a/exploration/ml_pipeline_classification.py b/exploration/ml_pipeline_classification.py index 736e3db..0b5dc15 100644 --- a/exploration/ml_pipeline_classification.py +++ b/exploration/ml_pipeline_classification.py @@ -41,12 +41,12 @@ if nb_dir not in sys.path: # %% [markdown] # ## Set script's parameters -cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter) +cv_method_str = '5kfold' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter) n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs undersampling = True # (bool) If True this will train and test data on balanced dataset (using undersampling method) # %% jupyter={"source_hidden": true} -model_input = pd.read_csv("../data/stressfulness_event_with_target_0/input_appraisal_stressfulness_event_mean.csv") +model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv") # model_input = model_input[model_input.columns.drop(list(model_input.filter(regex='empatica_temperature')))] # %% jupyter={"source_hidden": true} @@ -334,15 +334,15 @@ print(f"Smallest {n_sl} ACC:", np.sort(np.partition(rfc_scores['test_accuracy'], # %% jupyter={"source_hidden": true} rfc_es_fimp = pd.DataFrame(columns=list(train_x.columns)) for idx, estimator in enumerate(rfc_scores['estimator']): - print("\nFeatures sorted by their score for estimator {}:".format(idx)) feature_importances = pd.DataFrame(estimator.feature_importances_, index = list(train_x.columns), columns=['importance']) - print(feature_importances.sort_values('importance', ascending=False).head(10)) + # print("\nFeatures sorted by their score for estimator {}:".format(idx)) + # print(feature_importances.sort_values('importance', ascending=False).head(10)) rfc_es_fimp = pd.concat([rfc_es_fimp, feature_importances]).groupby(level=0).mean() pd.set_option('display.max_rows', 100) -print(rfc_es_fimp.sort_values('importance', ascending=False).head(100)) +print(rfc_es_fimp.sort_values('importance', ascending=False).head(30)) rfc_es_fimp.sort_values('importance', ascending=False).head(30).plot.bar() diff --git a/rapids b/rapids index 8a6b52a..7f5a4e6 160000 --- a/rapids +++ b/rapids @@ -1 +1 @@ -Subproject commit 8a6b52a97c95dcd8b70b980b4f46421b1a847905 +Subproject commit 7f5a4e6744e502d40dc38502e1e74bd2bf9fe786