Merge branch 'ml_pipeline' of repo.ijs.si:junoslukan/straw2analysis into ml_pipeline

# Conflicts:
#	.gitignore
#	exploration/ml_pipeline_classification_with_clustering.py

commit 8bbe0b2ba8

@@ -12,6 +12,7 @@ __pycache__/
 /data/*input*.csv
 /data/daily*
 /data/intradaily*
+/data/stressfulness_event*
 /data/30min*
 /presentation/*scores.csv
 /presentation/Results.ods

@@ -7,6 +7,7 @@ dependencies:
 - black
 - isort
 - flake8
+- imbalanced-learn=0.10.0
 - jupyterlab
 - jupytext
 - lightgbm

@@ -0,0 +1,166 @@
# -*- coding: utf-8 -*-
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %%
import datetime
import math
import os
import sys

import numpy as np
import pandas as pd
import seaborn as sns

nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

import participants.query_db
from features.esm import *
from features.esm_JCQ import *
from features.esm_SAM import *

from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

# %%
participants_inactive_usernames = participants.query_db.get_usernames(
    collection_start=datetime.date.fromisoformat("2020-08-01")
)
df_esm_inactive = get_esm_data(participants_inactive_usernames)

# %%
df_esm_preprocessed = preprocess_esm(df_esm_inactive)

# %% [markdown]
# Investigate stressfulness events

# %%
extracted_ers = df_esm_preprocessed.groupby(["device_id", "esm_session"])['timestamp'].apply(lambda x: math.ceil((x.max() - x.min()) / 1000)).reset_index().rename(columns={'timestamp': 'session_length'})  # questionnaire length in seconds
extracted_ers = extracted_ers[extracted_ers["session_length"] <= 15 * 60].reset_index(drop=True)  # drop sessions where answering the questionnaire took longer than 15 min
session_start_timestamp = df_esm_preprocessed.groupby(['device_id', 'esm_session'])['timestamp'].min().to_frame().rename(columns={'timestamp': 'session_start_timestamp'})  # questionnaire start timestamp
session_end_timestamp = df_esm_preprocessed.groupby(['device_id', 'esm_session'])['timestamp'].max().to_frame().rename(columns={'timestamp': 'session_end_timestamp'})  # questionnaire end timestamp
se_time = df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 90.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_time'})
se_duration = df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 91.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_duration'})

# Make the se_duration values the appropriate lengths

# Extract 3 targets that will be transferred in the CSV file to the cleaning script.
df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 87.].columns
se_stressfulness_event_tg = df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 87.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'appraisal_stressfulness_event'})

# All relevant features are joined with an inner join to remove standalone columns (e.g., the stressfulness event target has a larger count)
extracted_ers = extracted_ers.join(session_start_timestamp, on=['device_id', 'esm_session'], how='inner') \
    .join(session_end_timestamp, on=['device_id', 'esm_session'], how='inner') \
    .join(se_stressfulness_event_tg, on=['device_id', 'esm_session'], how='inner') \
    .join(se_time, on=['device_id', 'esm_session'], how='left') \
    .join(se_duration, on=['device_id', 'esm_session'], how='left')

# Filter out sessions that are not useful. Because of ambiguity, this excludes:
# (1) straw event times that are marked as "0 - I don't remember"
# (2) straw event durations that are marked as "0 - I don't remember"
extracted_ers = extracted_ers[(~extracted_ers.se_time.astype(str).str.startswith("0 - ")) & (~extracted_ers.se_duration.astype(str).str.startswith("0 - ")) & (~extracted_ers.se_duration.astype(str).str.startswith("Removed "))]
extracted_ers.reset_index(drop=True, inplace=True)

# Add a default duration in case the participant answered that no stressful event occurred (see the sketch below)
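# A minimal sketch of that default, not implemented in this commit: it assumes
# 0 s is an acceptable duration for "no stressful event" sessions and writes to
# a new, hypothetical column so the original se_duration stays untouched.
DEFAULT_NO_EVENT_DURATION = 0  # assumed default, in seconds
extracted_ers['se_duration_filled'] = extracted_ers['se_duration'].fillna(DEFAULT_NO_EVENT_DURATION)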

# Prepare data to fit the data structure in the CSV file ...
# Add the event time as the start of the questionnaire if no stress event occurred
extracted_ers['se_time'] = extracted_ers['se_time'].fillna(extracted_ers['session_start_timestamp'])
# The type can be an int (timestamp in ms), which stays the same, or a datetime str, which is converted to a timestamp in milliseconds
extracted_ers['event_timestamp'] = extracted_ers['se_time'].apply(lambda x: x if isinstance(x, int) else pd.to_datetime(x).timestamp() * 1000).astype('int64')
extracted_ers['shift_direction'] = -1

""">>>>> begin section (could be optimized) <<<<<"""

# Check whether the duration is marked with "1 - It's still ongoing", in which case the end of the current
# questionnaire is taken as the end time of the segment. Otherwise, the user-input duration is taken.
extracted_ers['temp_duration'] = extracted_ers['se_duration']
extracted_ers['se_duration'] = \
    np.where(
        extracted_ers['se_duration'].astype(str).str.startswith("1 - "),
        extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'],
        extracted_ers['se_duration']
    )

# This converts the rows with timestamps in milliseconds, and the rows with datetimes, to timestamps in seconds.
extracted_ers['se_duration'] = \
    extracted_ers['se_duration'].apply(lambda x: math.ceil(x / 1000) if isinstance(x, int) else abs(pd.to_datetime(x).hour * 60 + pd.to_datetime(x).minute) * 60)

# Check whether the minimum se_duration is at least as long as the interval of interest (ioi). Filter out the rest (see the sketch below).
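# A minimal sketch of that check, not implemented in this commit; MIN_IOI_SECONDS
# is a hypothetical placeholder for the interval-of-interest length. It only
# counts the offending rows, so the negative-duration analysis below still sees them.
MIN_IOI_SECONDS = 300  # assumed 5-minute interval of interest
too_short = extracted_ers['se_duration'] < MIN_IOI_SECONDS
print("Sessions shorter than the interval of interest:", too_short.sum())
# extracted_ers = extracted_ers[~too_short]  # uncomment to apply the filter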

""">>>>> end section <<<<<"""

# %%
# Count negative values of duration
print("Count all:", extracted_ers[['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
print("Count stressed:", extracted_ers[(~extracted_ers['se_duration'].isna())][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
print("Count negative durations (invalid se_time user input):", extracted_ers[extracted_ers['se_duration'] < 0][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
print("Count 0 durations:", extracted_ers[extracted_ers['se_duration'] == 0][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
extracted_ers[extracted_ers['se_duration'] <= 0][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0]
extracted_ers[(~extracted_ers['se_duration'].isna()) & (extracted_ers['se_duration'] <= 0)][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']]

ax = extracted_ers.hist(column='se_duration', bins='auto', grid=False, figsize=(12, 8), color='#86bf91', zorder=2, rwidth=0.9)
hist, bin_edges = np.histogram(extracted_ers['se_duration'].dropna())
hist
bin_edges

extracted_ers = extracted_ers[extracted_ers['se_duration'] >= 0]

# %%
# bins = [-100000000, 0, 0.0000001, 1200, 7200, 100000000]  # 'neg', 'zero', '<20min', '2h', 'high_pos' ..... right=False
bins = [-100000000, -0.0000001, 0, 300, 600, 1200, 3600, 7200, 14400, 1000000000]  # 'neg', 'zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'

extracted_ers['bins'], edges = pd.cut(extracted_ers.se_duration, bins=bins, labels=['neg', 'zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'], retbins=True, right=True)
sns.displot(
    data=extracted_ers.dropna(),
    x="bins",
    binwidth=0.1,
)

# %%
extracted_ers[extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'] >= 0]
extracted_ers['se_time'].value_counts()
pd.set_option('display.max_rows', 100)
# Here, we are interested in how far the stressful event times are from the end of the questionnaire.
extracted_ers = extracted_ers[~extracted_ers['se_duration'].isna()]  # Remove no-stress events
extracted_ers['diff_se_time_session_end'] = (extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'])

print("Count all:", extracted_ers[['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']].shape[0])
print("Count negative durations:", extracted_ers[extracted_ers['diff_se_time_session_end'] < 0][['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']].shape[0])
print("Count 0 durations:", extracted_ers[extracted_ers['diff_se_time_session_end'] == 0][['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']].shape[0])

extracted_ers[extracted_ers['diff_se_time_session_end'] < 0]['diff_se_time_session_end']
# extracted_ers = extracted_ers[(extracted_ers['diff_se_time_session_end'] > 0)]
bins2 = [-100000, 0, 300, 600, 1200, 3600, 7200, 14400, 1000000000]  # 'neg_zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'
extracted_ers['bins2'], edges = pd.cut(extracted_ers.diff_se_time_session_end, bins=bins2, labels=['neg_zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'], retbins=True, right=True)
extracted_ers['bins2']
sns.displot(
    data=extracted_ers.dropna(),
    x="bins2",
    binwidth=0.1,
)

extracted_ers.shape
extracted_ers.dropna().shape

# %%
extracted_ers['appraisal_stressfulness_event_num'] = extracted_ers['appraisal_stressfulness_event'].str[0].astype(int)
print("duration-target (corr):", extracted_ers['se_duration'].corr(extracted_ers['appraisal_stressfulness_event_num']))

# %%
# Explore groupby participants? (see the sketch below)
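# %%
# A minimal sketch of the per-participant exploration suggested above, not part
# of this commit; it assumes device_id identifies a participant in extracted_ers.
per_participant = extracted_ers.groupby('device_id').agg(
    n_events=('se_duration', 'count'),
    median_duration=('se_duration', 'median'),
    mean_stressfulness=('appraisal_stressfulness_event_num', 'mean'),
)
per_participant.describe()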

@@ -15,18 +15,15 @@

 # %% jupyter={"source_hidden": true}
 # %matplotlib inline
-import datetime
-import importlib
 import os
 import sys

 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
-import seaborn as sns

 from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
-from sklearn.model_selection import LeaveOneGroupOut, cross_validate
+from sklearn.model_selection import LeaveOneGroupOut, cross_validate, StratifiedKFold
 from sklearn.dummy import DummyClassifier
 from sklearn.impute import SimpleImputer

@@ -39,19 +36,18 @@ nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
     sys.path.append(nb_dir)

-import machine_learning.labels
-import machine_learning.model

 # %% [markdown]
 # # RAPIDS models

 # %% [markdown]
 # ## Set script's parameters
-cv_method_str = 'logo'  # logo, halflogo, 5kfold  # Cross-validation method (could be regarded as a hyperparameter)
-n_sl = 1  # Number of largest/smallest accuracies (of particular CV) outputs
+cv_method_str = '5kfold'  # logo, half_logo, 5kfold  # Cross-validation method (could be regarded as a hyperparameter)
+n_sl = 3  # Number of largest/smallest accuracies (of a particular CV) to output
+undersampling = True  # (bool) If True, train and test on a balanced dataset (obtained via undersampling)

 # %% jupyter={"source_hidden": true}
-model_input = pd.read_csv("../data/stressfulness_event_nonstandardized/input_appraisal_stressfulness_event_mean.csv")
+model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv")
+# model_input = model_input[model_input.columns.drop(list(model_input.filter(regex='empatica_temperature')))]

 # %% jupyter={"source_hidden": true}
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]

@@ -59,8 +55,8 @@ model_input.set_index(index_columns, inplace=True)
 model_input['target'].value_counts()

 # %% jupyter={"source_hidden": true}
-# bins = [-10, -1, 1, 10]  # bins for z-scored targets
-bins = [0, 1, 4]  # bins for stressfulness (1-4) target
+# bins = [-10, 0, 10]  # bins for z-scored targets
+bins = [-1, 0, 4]  # bins for stressfulness (0-4) target
 model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'high'], retbins=True, right=True)  # ['low', 'medium', 'high']
 model_input['target'].value_counts(), edges
 # model_input = model_input[model_input['target'] != "medium"]
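
With the new bins [-1, 0, 4] and right=True, pd.cut maps a target of 0 to 'low' and 1-4 to 'high'; under the old bins [0, 1, 4], 0 fell outside every bin and 1 mapped to 'low'. A quick check, not part of the diff:

pd.cut(pd.Series([0, 1, 2, 3, 4]), bins=[-1, 0, 4], labels=['low', 'high'], right=True).tolist()
# ['low', 'high', 'high', 'high', 'high']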

@@ -68,7 +64,21 @@ model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x

 model_input['target'].value_counts()

-if cv_method_str == 'halflogo':
+# %% jupyter={"source_hidden": true}
+# Undersampling
+if undersampling:
+    model_input.groupby("pid").count()
+    no_stress = model_input[model_input['target'] == 0]
+    stress = model_input[model_input['target'] == 1]
+
+    no_stress = no_stress.sample(n=len(stress))
+    model_input = pd.concat([stress, no_stress], axis=0)
+
+model_input["target"].value_counts()
+
+# %% jupyter={"source_hidden": true}
+if cv_method_str == 'half_logo':
     model_input['pid_index'] = model_input.groupby('pid').cumcount()
     model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
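
Since this commit also pins imbalanced-learn in the environment, the manual undersampling above could equally be written with its RandomUnderSampler; a sketch, not what the commit actually does (note that sample() above is unseeded, so passing random_state there would likewise make the manual version reproducible):

from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)  # seed chosen arbitrarily
balanced_x, balanced_y = rus.fit_resample(model_input.drop(columns=['target']), model_input['target'])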

@@ -101,7 +111,7 @@ train_x = pd.concat([numerical_features, categorical_features], axis=1)
 train_x.dtypes

 # %% jupyter={"source_hidden": true}
-cv_method = None  # Defaults to 5 k-folds in cross_validate method
+cv_method = StratifiedKFold(n_splits=5, shuffle=True)  # Default CV method: stratified 5-fold, unless logo/half_logo overrides it below
 if cv_method_str == 'logo' or cv_method_str == 'half_logo':
     cv_method = LeaveOneGroupOut()
     cv_method.get_n_splits(
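
The hunk ends before showing how pid_index and pid_count feed into the groups passed to LeaveOneGroupOut for half_logo; a minimal sketch of the usual construction (pid_half is a hypothetical column name), splitting each participant's rows into two CV groups:

model_input['pid_half'] = np.where(
    model_input['pid_index'] < model_input['pid_count'] / 2,
    model_input['pid'].astype(str) + '_first',
    model_input['pid'].astype(str) + '_second',
)
# These labels would then be passed as groups= to cross_validate.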

@@ -126,11 +136,12 @@ dummy_classifier = cross_validate(
     cv=cv_method,
     n_jobs=-1,
     error_score='raise',
-    scoring=('accuracy', 'average_precision', 'recall', 'f1')
+    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(dummy_classifier['test_accuracy']))
-print("Precision", np.mean(dummy_classifier['test_average_precision']))
+print("Acc (median)", np.nanmedian(dummy_classifier['test_accuracy']))
+print("Acc (mean)", np.mean(dummy_classifier['test_accuracy']))
+print("Precision", np.mean(dummy_classifier['test_precision']))
 print("Recall", np.mean(dummy_classifier['test_recall']))
 print("F1", np.mean(dummy_classifier['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dummy_classifier['test_accuracy'], n_sl)[:n_sl])[::-1])

@@ -153,7 +164,8 @@ log_reg_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(log_reg_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(log_reg_scores['test_accuracy']))
+print("Acc (mean)", np.mean(log_reg_scores['test_accuracy']))
 print("Precision", np.mean(log_reg_scores['test_precision']))
 print("Recall", np.mean(log_reg_scores['test_recall']))
 print("F1", np.mean(log_reg_scores['test_f1']))

@@ -177,7 +189,8 @@ svc_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(svc_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(svc_scores['test_accuracy']))
+print("Acc (mean)", np.mean(svc_scores['test_accuracy']))
 print("Precision", np.mean(svc_scores['test_precision']))
 print("Recall", np.mean(svc_scores['test_recall']))
 print("F1", np.mean(svc_scores['test_f1']))

@@ -202,7 +215,8 @@ gaussian_nb_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(gaussian_nb_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(gaussian_nb_scores['test_accuracy']))
+print("Acc (mean)", np.mean(gaussian_nb_scores['test_accuracy']))
 print("Precision", np.mean(gaussian_nb_scores['test_precision']))
 print("Recall", np.mean(gaussian_nb_scores['test_recall']))
 print("F1", np.mean(gaussian_nb_scores['test_f1']))

@@ -227,7 +241,8 @@ sgdc_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(sgdc_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(sgdc_scores['test_accuracy']))
+print("Acc (mean)", np.mean(sgdc_scores['test_accuracy']))
 print("Precision", np.mean(sgdc_scores['test_precision']))
 print("Recall", np.mean(sgdc_scores['test_recall']))
 print("F1", np.mean(sgdc_scores['test_f1']))

@@ -252,7 +267,8 @@ knn_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(knn_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(knn_scores['test_accuracy']))
+print("Acc (mean)", np.mean(knn_scores['test_accuracy']))
 print("Precision", np.mean(knn_scores['test_precision']))
 print("Recall", np.mean(knn_scores['test_recall']))
 print("F1", np.mean(knn_scores['test_f1']))

@@ -277,7 +293,8 @@ dtree_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(dtree_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(dtree_scores['test_accuracy']))
+print("Acc (mean)", np.mean(dtree_scores['test_accuracy']))
 print("Precision", np.mean(dtree_scores['test_precision']))
 print("Recall", np.mean(dtree_scores['test_recall']))
 print("F1", np.mean(dtree_scores['test_f1']))

@@ -299,16 +316,40 @@ rfc_scores = cross_validate(
     cv=cv_method,
     n_jobs=-1,
     error_score='raise',
-    scoring=('accuracy', 'precision', 'recall', 'f1')
+    scoring=('accuracy', 'precision', 'recall', 'f1'),
+    return_estimator=True
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(rfc_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(rfc_scores['test_accuracy']))
+print("Acc (mean)", np.mean(rfc_scores['test_accuracy']))
 print("Precision", np.mean(rfc_scores['test_precision']))
 print("Recall", np.mean(rfc_scores['test_recall']))
 print("F1", np.mean(rfc_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-rfc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(rfc_scores['test_accuracy'], n_sl)[:n_sl]))
+
+# %% [markdown]
+# ### Feature importance (RFC)
+
+# %% jupyter={"source_hidden": true}
+fold_importances = []
+for idx, estimator in enumerate(rfc_scores['estimator']):
+    feature_importances = pd.DataFrame(estimator.feature_importances_,
+                                       index=list(train_x.columns),
+                                       columns=['importance'])
+    # print("\nFeatures sorted by their score for estimator {}:".format(idx))
+    # print(feature_importances.sort_values('importance', ascending=False).head(10))
+    fold_importances.append(feature_importances)
+# Average across folds in a single step: folding the mean in incrementally would weight later estimators more heavily.
+rfc_es_fimp = pd.concat(fold_importances).groupby(level=0).mean()
+
+pd.set_option('display.max_rows', 100)
+print(rfc_es_fimp.sort_values('importance', ascending=False).head(30))
+
+rfc_es_fimp.sort_values('importance', ascending=False).head(30).plot.bar()
+
+rfc_es_fimp.sort_values('importance', ascending=False).tail(30).plot.bar()
+
+train_x['empatica_temperature_cr_stdDev_X_SO_mean'].value_counts()
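
Impurity-based feature_importances_ from random forests tend to favor high-cardinality features; sklearn's permutation_importance is a common cross-check. A sketch, assuming data_x and data_y stand in for the feature matrix and target passed to cross_validate above, which this diff does not show:

from sklearn.inspection import permutation_importance

result = permutation_importance(rfc_scores['estimator'][0], data_x, data_y, n_repeats=10, random_state=0)
perm_fimp = pd.Series(result.importances_mean, index=list(train_x.columns)).sort_values(ascending=False)
print(perm_fimp.head(30))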
# %% [markdown]
|
# %% [markdown]
|
||||||
# ### Gradient Boosting Classifier
|
# ### Gradient Boosting Classifier
|
||||||
|
|
||||||
|
@@ -327,7 +368,8 @@ gbc_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(gbc_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(gbc_scores['test_accuracy']))
+print("Acc (mean)", np.mean(gbc_scores['test_accuracy']))
 print("Precision", np.mean(gbc_scores['test_precision']))
 print("Recall", np.mean(gbc_scores['test_recall']))
 print("F1", np.mean(gbc_scores['test_f1']))

@@ -352,7 +394,8 @@ lgbm_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(lgbm_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(lgbm_scores['test_accuracy']))
+print("Acc (mean)", np.mean(lgbm_scores['test_accuracy']))
 print("Precision", np.mean(lgbm_scores['test_precision']))
 print("Recall", np.mean(lgbm_scores['test_recall']))
 print("F1", np.mean(lgbm_scores['test_f1']))

@@ -377,9 +420,12 @@ xgb_classifier_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(xgb_classifier_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(xgb_classifier_scores['test_accuracy']))
+print("Acc (mean)", np.mean(xgb_classifier_scores['test_accuracy']))
 print("Precision", np.mean(xgb_classifier_scores['test_precision']))
 print("Recall", np.mean(xgb_classifier_scores['test_recall']))
 print("F1", np.mean(xgb_classifier_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl]))
+
+# %%

@@ -76,7 +76,6 @@ model_input[clust_col].describe()
 # Filter out outlier rows by clust_col
 # model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]

-# print(model_input)
 uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
 uniq = uniq.dropna()
 plt.bar(uniq['pid'], uniq[clust_col])

@@ -110,7 +109,7 @@ for k in range(n_clusters):

     model_input_subset['target'].value_counts()

-    if cv_method_str == 'halflogo':
+    if cv_method_str == 'half_logo':
         model_input_subset['pid_index'] = model_input_subset.groupby('pid').cumcount()
         model_input_subset['pid_count'] = model_input_subset.groupby('pid')['pid'].transform('count')

@@ -141,7 +140,7 @@ for k in range(n_clusters):
     train_x = pd.concat([numerical_features, categorical_features], axis=1)

     # Establish CV method
-    cv_method = None  # Defaults to 5 k-folds in cross_validate method
+    cv_method = StratifiedKFold(n_splits=5, shuffle=True)  # Default CV method: stratified 5-fold, unless logo/half_logo overrides it below
     if cv_method_str == 'logo' or cv_method_str == 'half_logo':
         cv_method = LeaveOneGroupOut()
         cv_method.get_n_splits(

@@ -183,14 +182,3 @@ for k in range(n_clusters):
 # %% jupyter={"source_hidden": true}
 # Get overall results
 cm.get_total_models_scores(n_clusters=n_clusters)
-
-# %%
-final_scores = pd.DataFrame()
-for model in cmodels:
-    final_scores = pd.concat([final_scores, pd.DataFrame.from_dict(cmodels[model])])
-
-# %%
-final_scores
-
-# %%
-

@@ -15,26 +15,18 @@

 # %% jupyter={"source_hidden": true}
 # %matplotlib inline
-import datetime
-import importlib
 import os
 import sys

 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
-import seaborn as sns
 from scipy import stats

-from sklearn.model_selection import LeaveOneGroupOut, cross_validate, train_test_split
+from sklearn.model_selection import train_test_split
 from sklearn.impute import SimpleImputer
 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

-from sklearn.dummy import DummyClassifier
-from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
-from lightgbm import LGBMClassifier
-import xgboost as xg
-
 from sklearn.cluster import KMeans

 from IPython.core.interactiveshell import InteractiveShell

@@ -44,8 +36,6 @@ nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
     sys.path.append(nb_dir)

-import machine_learning.labels
-import machine_learning.model
 from machine_learning.classification_models import ClassificationModels

 # %% [markdown]

@@ -47,6 +47,7 @@ import machine_learning.helper

 # %% tags=["active-ipynb"]
 # filename = Path("E:/STRAWresults/inputData/stressfulness_event/input_appraisal_stressfulness_event_mean.csv")
+# filename = Path('C:/Users/Primoz/VSCodeProjects/straw2analysis/data/stressfulness_event/input_appraisal_stressfulness_event_mean.csv')

 # %%
 final_scores = machine_learning.helper.run_all_regression_models(filename)