Merge branch 'ml_pipeline' of repo.ijs.si:junoslukan/straw2analysis into ml_pipeline

# Conflicts:
#	.gitignore
#	exploration/ml_pipeline_classification_with_clustering.py
ml_pipeline
junos 2023-01-04 18:19:43 +01:00
commit 8bbe0b2ba8
7 changed files with 246 additions and 53 deletions

.gitignore vendored
View File

@@ -12,6 +12,7 @@ __pycache__/
 /data/*input*.csv
 /data/daily*
 /data/intradaily*
+/data/stressfulness_event*
 /data/30min*
 /presentation/*scores.csv
 /presentation/Results.ods

View File

@@ -7,6 +7,7 @@ dependencies:
 - black
 - isort
 - flake8
+- imbalanced-learn=0.10.0
 - jupyterlab
 - jupytext
 - lightgbm

View File

@@ -0,0 +1,166 @@
# -*- coding: utf-8 -*-
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.13.0
# kernelspec:
# display_name: straw2analysis
# language: python
# name: straw2analysis
# ---
# %%
import os
import sys
import datetime
import math

import numpy as np  # np.where and np.histogram are used below
import pandas as pd  # pd.to_datetime, pd.cut, and pd.set_option are used below
import seaborn as sns
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
import participants.query_db
from features.esm import *
from features.esm_JCQ import *
from features.esm_SAM import *
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# %%
participants_inactive_usernames = participants.query_db.get_usernames(
    collection_start=datetime.date.fromisoformat("2020-08-01")
)
df_esm_inactive = get_esm_data(participants_inactive_usernames)
# %%
df_esm_preprocessed = preprocess_esm(df_esm_inactive)
# %% [markdown]
# Investigate stressfulness events
# %%
extracted_ers = df_esm_preprocessed.groupby(["device_id", "esm_session"])['timestamp'].apply(lambda x: math.ceil((x.max() - x.min()) / 1000)).reset_index().rename(columns={'timestamp': 'session_length'}) # questionnaire length
extracted_ers = extracted_ers[extracted_ers["session_length"] <= 15 * 60].reset_index(drop=True) # ensure that the longest duration of the questionnaire answering is 15 min
session_start_timestamp = df_esm_preprocessed.groupby(['device_id', 'esm_session'])['timestamp'].min().to_frame().rename(columns={'timestamp': 'session_start_timestamp'}) # questionnaire start timestamp
session_end_timestamp = df_esm_preprocessed.groupby(['device_id', 'esm_session'])['timestamp'].max().to_frame().rename(columns={'timestamp': 'session_end_timestamp'}) # questionnaire end timestamp
se_time = df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 90.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_time'})
se_duration = df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 91.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_duration'})
# Convert se_duration values to the appropriate lengths
# Extract the 3 target-related columns that will be transferred in the CSV file to the cleaning script.
df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 87.].columns
se_stressfulness_event_tg = df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 87.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'appraisal_stressfulness_event'})
# All relevant features are joined with an inner join to remove standalone columns (e.g., the stressfulness event target has a larger count)
extracted_ers = extracted_ers.join(session_start_timestamp, on=['device_id', 'esm_session'], how='inner') \
    .join(session_end_timestamp, on=['device_id', 'esm_session'], how='inner') \
    .join(se_stressfulness_event_tg, on=['device_id', 'esm_session'], how='inner') \
    .join(se_time, on=['device_id', 'esm_session'], how='left') \
    .join(se_duration, on=['device_id', 'esm_session'], how='left')
# Filter out the sessions that are not useful. Because of the ambiguity this excludes:
# (1) straw event times that are marked as "0 - I don't remember"
# (2) straw event durations that are marked as "0 - I don't remember"
extracted_ers = extracted_ers[(~extracted_ers.se_time.astype(str).str.startswith("0 - ")) & (~extracted_ers.se_duration.astype(str).str.startswith("0 - ")) & (~extracted_ers.se_duration.astype(str).str.startswith("Removed "))]
extracted_ers.reset_index(drop=True, inplace=True)
# Add a default duration in case the participant answered that no stressful event occurred
# Prepare data to fit the data structure in the CSV file ...
# Add the event time as the start of the questionnaire if no stress event occurred
extracted_ers['se_time'] = extracted_ers['se_time'].fillna(extracted_ers['session_start_timestamp'])
# The type could be an int (timestamp [ms]), which stays the same, or a datetime str, which is converted to a timestamp in milliseconds
extracted_ers['event_timestamp'] = extracted_ers['se_time'].apply(lambda x: x if isinstance(x, int) else pd.to_datetime(x).timestamp() * 1000).astype('int64')
extracted_ers['shift_direction'] = -1
""">>>>> begin section (could be optimized) <<<<<"""
# Check whether the duration is marked with "1 - It's still ongoing", which means that the end of the current questionnaire
# is taken as the end time of the segment. Otherwise, the user-input duration is taken.
extracted_ers['temp_duration'] = extracted_ers['se_duration']
extracted_ers['se_duration'] = \
    np.where(
        extracted_ers['se_duration'].astype(str).str.startswith("1 - "),
        extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'],
        extracted_ers['se_duration']
    )
# Convert rows holding millisecond values and rows holding datetime strings to durations in seconds.
extracted_ers['se_duration'] = \
    extracted_ers['se_duration'].apply(lambda x: math.ceil(x / 1000) if isinstance(x, int) else abs(pd.to_datetime(x).hour * 60 + pd.to_datetime(x).minute) * 60)
# Check whether the minimum se_duration is at least as long as the ioi (interval of interest). Filter out the rest.
""">>>>> end section <<<<<"""
# %% [markdown]
# Count negative values of duration
# %%
print("Count all:", extracted_ers[['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
print("Count stressed:", extracted_ers[(~extracted_ers['se_duration'].isna())][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
print("Count negative durations (invalid se_time user input):", extracted_ers[extracted_ers['se_duration'] < 0][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
print("Count 0 durations:", extracted_ers[extracted_ers['se_duration'] == 0][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
extracted_ers[extracted_ers['se_duration'] <= 0][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0]
extracted_ers[(~extracted_ers['se_duration'].isna()) & (extracted_ers['se_duration'] <= 0)][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']]
ax = extracted_ers.hist(column='se_duration', bins='auto', grid=False, figsize=(12,8), color='#86bf91', zorder=2, rwidth=0.9)
hist, bin_edges = np.histogram(extracted_ers['se_duration'].dropna())
hist
bin_edges
extracted_ers = extracted_ers[extracted_ers['se_duration'] >= 0]
# %%
# bins = [-100000000, 0, 0.0000001, 1200, 7200, 100000000] #'neg', 'zero', '<20min', '2h', 'high_pos' ..... right=False
bins = [-100000000, -0.0000001, 0, 300, 600, 1200, 3600, 7200, 14400, 1000000000] # 'neg', 'zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'
extracted_ers['bins'], edges = pd.cut(extracted_ers.se_duration, bins=bins, labels=['neg', 'zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'], retbins=True, right=True) #['low', 'medium', 'high']
sns.displot(
    data=extracted_ers.dropna(),
    x="bins",
    binwidth=0.1,
)
# %%
extracted_ers[extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'] >= 0]
extracted_ers['se_time'].value_counts()
pd.set_option('display.max_rows', 100)
# Here we are interested in how far the stressful event times are from the end of the questionnaire.
extracted_ers = extracted_ers[~extracted_ers['se_duration'].isna()] # Remove sessions without a stress event
extracted_ers['diff_se_time_session_end'] = (extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'])
print("Count all:", extracted_ers[['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']].shape[0])
print("Count negative durations:", extracted_ers[extracted_ers['diff_se_time_session_end'] < 0][['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']])
print("Count 0 durations:", extracted_ers[extracted_ers['diff_se_time_session_end'] == 0][['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']].shape[0])
extracted_ers[extracted_ers['diff_se_time_session_end'] < 0]['diff_se_time_session_end']
# extracted_ers = extracted_ers[(extracted_ers['diff_se_time_session_end'] > 0)]
bins2 = [-100000, 0, 300, 600, 1200, 3600, 7200, 14400, 1000000000] # 'neg_zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'
extracted_ers['bins2'], edges = pd.cut(extracted_ers.diff_se_time_session_end, bins=bins2, labels=['neg_zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'], retbins=True, right=True) #['low', 'medium', 'high']
extracted_ers['bins2']
sns.displot(
    data=extracted_ers.dropna(),
    x="bins2",
    binwidth=0.1,
)
extracted_ers.shape
extracted_ers.dropna().shape
print()
# %%
extracted_ers['appraisal_stressfulness_event_num'] = extracted_ers['appraisal_stressfulness_event'].str[0].astype(int)
print("duration-target (corr):", extracted_ers['se_duration'].corr(extracted_ers['appraisal_stressfulness_event_num']))
# %%
# Explore groupby participants?
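# A possible way to start answering the question above (a sketch, not part of the committed file):
# summarise, per participant (device_id), how many stress events were reported, how long they lasted,
# and how stressful they were rated on average.
per_participant = extracted_ers.groupby('device_id').agg(
    n_events=('se_duration', 'count'),
    median_duration_s=('se_duration', 'median'),
    mean_stressfulness=('appraisal_stressfulness_event_num', 'mean'),
)
print(per_participant.sort_values('n_events', ascending=False).head(20))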

View File

@@ -15,18 +15,15 @@
 # %% jupyter={"source_hidden": true}
 # %matplotlib inline
-import datetime
-import importlib
 import os
 import sys
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
-import seaborn as sns
 from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
-from sklearn.model_selection import LeaveOneGroupOut, cross_validate
+from sklearn.model_selection import LeaveOneGroupOut, cross_validate, StratifiedKFold
 from sklearn.dummy import DummyClassifier
 from sklearn.impute import SimpleImputer
@@ -39,19 +36,18 @@ nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
     sys.path.append(nb_dir)
-import machine_learning.labels
-import machine_learning.model
 # %% [markdown]
 # # RAPIDS models
 # %% [markdown]
 # ## Set script's parameters
-cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
+cv_method_str = '5kfold' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
-n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
+n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
+undersampling = True # (bool) If True this will train and test data on balanced dataset (using undersampling method)
 # %% jupyter={"source_hidden": true}
-model_input = pd.read_csv("../data/stressfulness_event_nonstandardized/input_appraisal_stressfulness_event_mean.csv")
+model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv")
+# model_input = model_input[model_input.columns.drop(list(model_input.filter(regex='empatica_temperature')))]
 # %% jupyter={"source_hidden": true}
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
@@ -59,8 +55,8 @@ model_input.set_index(index_columns, inplace=True)
 model_input['target'].value_counts()
 # %% jupyter={"source_hidden": true}
-# bins = [-10, -1, 1, 10] # bins for z-scored targets
+# bins = [-10, 0, 10] # bins for z-scored targets
-bins = [0, 1, 4] # bins for stressfulness (1-4) target
+bins = [-1, 0, 4] # bins for stressfulness (0-4) target
 model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'high'], retbins=True, right=True) #['low', 'medium', 'high']
 model_input['target'].value_counts(), edges
 # model_input = model_input[model_input['target'] != "medium"]
@@ -68,7 +64,21 @@ model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x
 model_input['target'].value_counts()
-if cv_method_str == 'halflogo':
+# %% jupyter={"source_hidden": true}
+# UnderSampling
+if undersampling:
+    model_input.groupby("pid").count()
+    no_stress = model_input[model_input['target'] == 0]
+    stress = model_input[model_input['target'] == 1]
+    no_stress = no_stress.sample(n=len(stress))
+    model_input = pd.concat([stress,no_stress], axis=0)
+    model_input["target"].value_counts()
+# %% jupyter={"source_hidden": true}
+if cv_method_str == 'half_logo':
     model_input['pid_index'] = model_input.groupby('pid').cumcount()
     model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
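(The remainder of the half_logo branch falls outside this hunk. As a rough sketch, the two per-participant groups could be derived from pid_index and pid_count as below; the pid_half name and the split rule are assumptions for illustration, not the committed code.)

# Hypothetical sketch: tag each row with the first or second half of its participant's data,
# so that LeaveOneGroupOut over 'pid_half' leaves out half of one participant at a time.
model_input['pid_half'] = model_input['pid'] + "_" + np.where(
    model_input['pid_index'] < model_input['pid_count'] / 2, "first", "second"
)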
@@ -101,7 +111,7 @@ train_x = pd.concat([numerical_features, categorical_features], axis=1)
 train_x.dtypes
 # %% jupyter={"source_hidden": true}
-cv_method = None # Defaults to 5 k-folds in cross_validate method
+cv_method = StratifiedKFold(n_splits=5, shuffle=True) # Defaults to 5 k-folds in cross_validate method
 if cv_method_str == 'logo' or cv_method_str == 'half_logo':
     cv_method = LeaveOneGroupOut()
     cv_method.get_n_splits(
@@ -126,11 +136,12 @@ dummy_classifier = cross_validate(
     cv=cv_method,
     n_jobs=-1,
     error_score='raise',
-    scoring=('accuracy', 'average_precision', 'recall', 'f1')
+    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(dummy_classifier['test_accuracy']))
-print("Precision", np.mean(dummy_classifier['test_average_precision']))
+print("Acc (median)", np.nanmedian(dummy_classifier['test_accuracy']))
+print("Acc (mean)", np.mean(dummy_classifier['test_accuracy']))
+print("Precision", np.mean(dummy_classifier['test_precision']))
 print("Recall", np.mean(dummy_classifier['test_recall']))
 print("F1", np.mean(dummy_classifier['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dummy_classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
@@ -153,7 +164,8 @@ log_reg_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(log_reg_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(log_reg_scores['test_accuracy']))
+print("Acc (mean)", np.mean(log_reg_scores['test_accuracy']))
 print("Precision", np.mean(log_reg_scores['test_precision']))
 print("Recall", np.mean(log_reg_scores['test_recall']))
 print("F1", np.mean(log_reg_scores['test_f1']))
@@ -177,7 +189,8 @@ svc_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(svc_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(svc_scores['test_accuracy']))
+print("Acc (mean)", np.mean(svc_scores['test_accuracy']))
 print("Precision", np.mean(svc_scores['test_precision']))
 print("Recall", np.mean(svc_scores['test_recall']))
 print("F1", np.mean(svc_scores['test_f1']))
@@ -202,7 +215,8 @@ gaussian_nb_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(gaussian_nb_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(gaussian_nb_scores['test_accuracy']))
+print("Acc (mean)", np.mean(gaussian_nb_scores['test_accuracy']))
 print("Precision", np.mean(gaussian_nb_scores['test_precision']))
 print("Recall", np.mean(gaussian_nb_scores['test_recall']))
 print("F1", np.mean(gaussian_nb_scores['test_f1']))
@@ -227,7 +241,8 @@ sgdc_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(sgdc_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(sgdc_scores['test_accuracy']))
+print("Acc (mean)", np.mean(sgdc_scores['test_accuracy']))
 print("Precision", np.mean(sgdc_scores['test_precision']))
 print("Recall", np.mean(sgdc_scores['test_recall']))
 print("F1", np.mean(sgdc_scores['test_f1']))
@@ -252,7 +267,8 @@ knn_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(knn_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(knn_scores['test_accuracy']))
+print("Acc (mean)", np.mean(knn_scores['test_accuracy']))
 print("Precision", np.mean(knn_scores['test_precision']))
 print("Recall", np.mean(knn_scores['test_recall']))
 print("F1", np.mean(knn_scores['test_f1']))
@@ -277,7 +293,8 @@ dtree_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(dtree_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(dtree_scores['test_accuracy']))
+print("Acc (mean)", np.mean(dtree_scores['test_accuracy']))
 print("Precision", np.mean(dtree_scores['test_precision']))
 print("Recall", np.mean(dtree_scores['test_recall']))
 print("F1", np.mean(dtree_scores['test_f1']))
@@ -299,16 +316,40 @@ rfc_scores = cross_validate(
     cv=cv_method,
     n_jobs=-1,
     error_score='raise',
-    scoring=('accuracy', 'precision', 'recall', 'f1')
+    scoring=('accuracy', 'precision', 'recall', 'f1'),
+    return_estimator=True
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(rfc_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(rfc_scores['test_accuracy']))
+print("Acc (mean)", np.mean(rfc_scores['test_accuracy']))
 print("Precision", np.mean(rfc_scores['test_precision']))
 print("Recall", np.mean(rfc_scores['test_recall']))
 print("F1", np.mean(rfc_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-rfc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(rfc_scores['test_accuracy'], n_sl)[:n_sl]))
+# %% [markdown]
+# ### Feature importance (RFC)
+# %% jupyter={"source_hidden": true}
+rfc_es_fimp = pd.DataFrame(columns=list(train_x.columns))
+for idx, estimator in enumerate(rfc_scores['estimator']):
+    feature_importances = pd.DataFrame(estimator.feature_importances_,
+        index = list(train_x.columns),
+        columns=['importance'])
+    # print("\nFeatures sorted by their score for estimator {}:".format(idx))
+    # print(feature_importances.sort_values('importance', ascending=False).head(10))
+    rfc_es_fimp = pd.concat([rfc_es_fimp, feature_importances]).groupby(level=0).mean()
+pd.set_option('display.max_rows', 100)
+print(rfc_es_fimp.sort_values('importance', ascending=False).head(30))
+rfc_es_fimp.sort_values('importance', ascending=False).head(30).plot.bar()
+rfc_es_fimp.sort_values('importance', ascending=False).tail(30).plot.bar()
+train_x['empatica_temperature_cr_stdDev_X_SO_mean'].value_counts()
 # %% [markdown]
 # ### Gradient Boosting Classifier
@@ -327,7 +368,8 @@ gbc_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(gbc_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(gbc_scores['test_accuracy']))
+print("Acc (mean)", np.mean(gbc_scores['test_accuracy']))
 print("Precision", np.mean(gbc_scores['test_precision']))
 print("Recall", np.mean(gbc_scores['test_recall']))
 print("F1", np.mean(gbc_scores['test_f1']))
@@ -352,7 +394,8 @@ lgbm_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(lgbm_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(lgbm_scores['test_accuracy']))
+print("Acc (mean)", np.mean(lgbm_scores['test_accuracy']))
 print("Precision", np.mean(lgbm_scores['test_precision']))
 print("Recall", np.mean(lgbm_scores['test_recall']))
 print("F1", np.mean(lgbm_scores['test_f1']))
@@ -377,9 +420,12 @@ xgb_classifier_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(xgb_classifier_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(xgb_classifier_scores['test_accuracy']))
+print("Acc (mean)", np.mean(xgb_classifier_scores['test_accuracy']))
 print("Precision", np.mean(xgb_classifier_scores['test_precision']))
 print("Recall", np.mean(xgb_classifier_scores['test_recall']))
 print("F1", np.mean(xgb_classifier_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl]))
+# %%

View File

@@ -76,7 +76,6 @@ model_input[clust_col].describe()
 # Filter-out outlier rows by clust_col
 #model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
-#print(model_input)
 uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
 uniq = uniq.dropna()
 plt.bar(uniq['pid'], uniq[clust_col])
@@ -110,7 +109,7 @@ for k in range(n_clusters):
     model_input_subset['target'].value_counts()
-    if cv_method_str == 'halflogo':
+    if cv_method_str == 'half_logo':
         model_input_subset['pid_index'] = model_input_subset.groupby('pid').cumcount()
         model_input_subset['pid_count'] = model_input_subset.groupby('pid')['pid'].transform('count')
@@ -141,7 +140,7 @@ for k in range(n_clusters):
     train_x = pd.concat([numerical_features, categorical_features], axis=1)
     # Establish cv method
-    cv_method = None # Defaults to 5 k-folds in cross_validate method
+    cv_method = StratifiedKFold(n_splits=5, shuffle=True) # Defaults to 5 k-folds in cross_validate method
     if cv_method_str == 'logo' or cv_method_str == 'half_logo':
         cv_method = LeaveOneGroupOut()
         cv_method.get_n_splits(
@@ -183,14 +182,3 @@ for k in range(n_clusters):
 # %% jupyter={"source_hidden": true}
 # Get overall results
 cm.get_total_models_scores(n_clusters=n_clusters)
-# %%
-final_scores = pd.DataFrame()
-for model in cmodels:
-    final_scores = pd.concat([final_scores,pd.DataFrame.from_dict(cmodels[model])])
-# %%
-final_scores
-# %%

View File

@@ -15,26 +15,18 @@
 # %% jupyter={"source_hidden": true}
 # %matplotlib inline
-import datetime
-import importlib
 import os
 import sys
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
-import seaborn as sns
 from scipy import stats
-from sklearn.model_selection import LeaveOneGroupOut, cross_validate, train_test_split
+from sklearn.model_selection import train_test_split
 from sklearn.impute import SimpleImputer
 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
-from sklearn.dummy import DummyClassifier
-from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
-from lightgbm import LGBMClassifier
-import xgboost as xg
 from sklearn.cluster import KMeans
 from IPython.core.interactiveshell import InteractiveShell
@@ -44,8 +36,6 @@ nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
     sys.path.append(nb_dir)
-import machine_learning.labels
-import machine_learning.model
 from machine_learning.classification_models import ClassificationModels
 # %% [markdown]

View File

@@ -47,6 +47,7 @@ import machine_learning.helper
 # %% tags=["active-ipynb"]
 # filename = Path("E:/STRAWresults/inputData/stressfulness_event/input_appraisal_stressfulness_event_mean.csv")
+# filename = Path('C:/Users/Primoz/VSCodeProjects/straw2analysis/data/stressfulness_event/input_appraisal_stressfulness_event_mean.csv')
 # %%
 final_scores = machine_learning.helper.run_all_regression_models(filename)