# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#     jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %matplotlib inline
import math
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.naive_bayes import GaussianNB
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
# Load the model input and index it by the RAPIDS time-segment columns.
index_columns = [
    "local_segment",
    "local_segment_label",
    "local_segment_start_datetime",
    "local_segment_end_datetime",
]
model_input = pd.read_csv(
    "../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv"
).set_index(index_columns)

categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [
    col
    for col in model_input.columns
    if "mostcommonactivity" in col or "homelabel" in col
]
categorical_feature_colnames += additional_categorical_features

categorical_features = model_input[categorical_feature_colnames].copy()
# Fill missing categorical values with the per-column mode.
mode_categorical_features = categorical_features.mode().iloc[0]
categorical_features = categorical_features.fillna(mode_categorical_features)
# One-hot encoding.
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)

numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
model_input = pd.concat([numerical_features, categorical_features], axis=1)

# Binarization of the target: 0 stays 0, ratings 1-4 become 1.
bins = [-1, 0, 4]  # bins for stressfulness (0-4) target
model_input["target"], edges = pd.cut(
    model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True
)

print(
    "Non-numeric cols (or target):",
    list(
        model_input.columns.difference(
            model_input.select_dtypes(include=np.number).columns
        )
    ),
)
print(
    "Shapes of numeric df:",
    model_input.shape,
    model_input.select_dtypes(include=np.number).shape,
)
# %%
# Add the "demo_" prefix to demographic features so they can later be
# selected together by substring, like the sensor feature groups.
demo_features = [
    "age",
    "limesurvey_demand",
    "limesurvey_control",
    "limesurvey_demand_control_ratio",
    "limesurvey_demand_control_ratio_quartile",
    "gender_F",
    "gender_M",
    "startlanguage_nl",
    "startlanguage_sl",
]

model_input.rename(columns={col: "demo_" + col for col in demo_features}, inplace=True)

demo_features = [
    "demo_age",
    "demo_limesurvey_demand",
    "demo_limesurvey_control",
    "demo_limesurvey_demand_control_ratio",
    "demo_limesurvey_demand_control_ratio_quartile",
    "target",
    "demo_gender_F",
    "demo_gender_M",
    "demo_startlanguage_nl",
    "demo_startlanguage_sl",
]
# %%
# Get phone and non-phone columns
import warnings


def make_predictions_with_sensor_groups(
    df, groups_substrings, include_group=True, with_cols=None, print_flag=False
):
    """
    Evaluate a GaussianNB classifier on column subsets selected by substring.

    For each substring in ``groups_substrings`` a list of feature columns is
    built from ``df``: columns containing the substring when ``include_group``
    is True, columns NOT containing it when False (a ``None`` substring means
    all columns). The 'pid' and 'target' columns are always excluded; the
    columns in ``with_cols`` are always appended. The data is reduced to a
    stratified 80% subsample, missing values are imputed with the column
    median, and a Gaussian naive Bayes model is scored with 5-fold stratified
    cross-validation (accuracy, precision, recall, F1). Mean/SD scores are
    printed when ``print_flag`` is True.

    Returns a 5-tuple for the group with the highest mean recall:
    (best_sensor, best_recall, best_f1, best_recall_std, best_f1_std).
    All four scores are None if ``groups_substrings`` is empty.
    """
    # Avoid the mutable-default-argument pitfall (shared list across calls).
    if with_cols is None:
        with_cols = []

    best_sensor = None
    best_recall_score, best_f1_score = None, None
    # Initialize the std trackers too, so the return statement cannot raise
    # NameError when groups_substrings is empty.
    best_recall_score_std, best_f1_score_std = None, None

    for fgroup_substr in groups_substrings:
        if fgroup_substr is None:
            feature_group_cols = list(df.columns)
            feature_group_cols.remove("pid")
            feature_group_cols.remove("target")
        else:
            if include_group:
                feature_group_cols = [
                    col
                    for col in df.columns
                    if fgroup_substr in col and col not in ["pid", "target"]
                ]
            else:
                feature_group_cols = [
                    col
                    for col in df.columns
                    if fgroup_substr not in col and col not in ["pid", "target"]
                ]

        X = df.drop(columns=["target", "pid"])[feature_group_cols + with_cols]
        y = df["target"]
        # Keep a stratified 80% subsample (the held-out 20% is discarded;
        # scoring is done via cross-validation below).
        X, _, y, _ = train_test_split(X, y, stratify=y, random_state=19, test_size=0.2)

        imputer = SimpleImputer(missing_values=np.nan, strategy="median")
        nb = GaussianNB()
        model_cv = cross_validate(
            nb,
            X=imputer.fit_transform(X),
            y=y,
            cv=StratifiedKFold(n_splits=5, shuffle=True),
            n_jobs=-1,
            scoring=("accuracy", "precision", "recall", "f1"),
        )

        if print_flag:
            if include_group:
                print("\nPrediction with", fgroup_substr)
            else:
                print("\nPrediction without", fgroup_substr)

        with warnings.catch_warnings():
            # Precision can be ill-defined when a CV fold predicts no
            # positive samples; suppress that specific sklearn warning.
            warnings.filterwarnings(
                "ignore",
                message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.",
            )

            acc = np.mean(model_cv["test_accuracy"])
            acc_std = np.std(model_cv["test_accuracy"])
            prec = np.mean(model_cv["test_precision"])
            prec_std = np.std(model_cv["test_precision"])
            rec = np.mean(model_cv["test_recall"])
            rec_std = np.std(model_cv["test_recall"])
            f1 = np.mean(model_cv["test_f1"])
            f1_std = np.std(model_cv["test_f1"])

            if print_flag:
                print("************************************************")
                print(f"Accuracy: {acc} (sd={acc_std})")
                print(f"Precision: {prec} (sd={prec_std})")
                print(f"Recall: {rec} (sd={rec_std})")
                print(f"F1: {f1} (sd={f1_std})\n")

            # Track the group with the best mean recall seen so far.
            if (not best_recall_score and not best_f1_score) or (
                rec > best_recall_score
            ):
                best_sensor = fgroup_substr
                best_recall_score, best_f1_score = rec, f1
                best_recall_score_std, best_f1_score_std = rec_std, f1_std

    return (
        best_sensor,
        best_recall_score,
        best_f1_score,
        best_recall_score_std,
        best_f1_score_std,
    )
# %% [markdown]
# ### sensor big feature groups (phone, empatica, demographical)
big_groups_substr = ["phone_", "empatica_", "demo_"]
# Leave-one-group-out evaluation over the three coarse feature groups.
make_predictions_with_sensor_groups(
    model_input.copy(), groups_substrings=big_groups_substr, include_group=False
)
# %% [markdown]
# ### Empatica sensor groups
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
# e4_sensors = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=e4_sensors, include_group=False)

# %% [markdown]
# ### Phone sensor groups
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
# phone_sensors = ["phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_",
#                  "phone_light_", "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"]
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=phone_sensors, include_group=False)

# %%
# List all the sensors (phone, empatica); separate other (demographic) cols also
sensors_features_groups = [
    "empatica_inter_beat_",
    "empatica_accelerometer_",
    "empatica_temperature_",
    "empatica_electrodermal_",
    "phone_activity_",
    "phone_applications_",
    "phone_bluetooth_",
    "phone_battery_",
    "phone_calls_",
    "phone_light_",
    "phone_locations_",
    "phone_messages",
    "phone_screen_",
]  # , "phone_speech_"]
# %%
def find_sensor_group_features_importance(model_input, sensor_groups_strings):
    """
    Greedy forward selection over sensor feature groups.

    Repeatedly calls make_predictions_with_sensor_groups() over the remaining
    group substrings, each time keeping the columns of all previously selected
    groups (``with_cols``). The winning group of each round is appended to the
    result and removed from the candidate list, until every group has been
    ranked.

    Returns a list of 5-tuples, one per round, in selection order:
    (best_sensor, best_recall, best_f1, best_recall_std, best_f1_std).
    """
    sensor_importance_scores = []
    # Work on copies so the caller's dataframe and candidate list are untouched.
    model_input = model_input.copy()
    sensor_groups_strings = sensor_groups_strings.copy()
    for _ in range(len(sensor_groups_strings)):
        important_cols = [score[0] for score in sensor_importance_scores]
        with_cols = [
            col
            for col in model_input.columns
            if any(col.startswith(prefix) for prefix in important_cols)
        ]

        (
            best_sensor,
            best_recall_score,
            best_f1_score,
            best_recall_score_std,
            best_f1_score_std,
        ) = make_predictions_with_sensor_groups(
            model_input,
            groups_substrings=sensor_groups_strings,
            include_group=True,
            with_cols=with_cols,
        )
        sensor_importance_scores.append(
            (
                best_sensor,
                best_recall_score,
                best_f1_score,
                best_recall_score_std,
                best_f1_score_std,
            )
        )
        print(f"\nAdded sensor: {best_sensor}\n")
        sensor_groups_strings.remove(best_sensor)
    return sensor_importance_scores
# %%
# Method for unpacking a list of score tuples into parallel lists
def sort_tuples_to_lists(list_of_tuples):
    """
    Unpack a list of 5-tuples into five parallel lists.

    Each tuple is (name, recall, f1, recall_std, f1_std). The first element is
    kept as-is in ``xs``; the four numeric elements are rounded to 4 decimal
    places.

    Returns (xs, y_recall, y_fscore, recall_std, fscore_std).
    """
    xs, y_recall, y_fscore, recall_std, fscore_std = [], [], [], [], []
    for a_tuple in list_of_tuples:
        xs.append(a_tuple[0])
        y_recall.append(round(a_tuple[1], 4))
        y_fscore.append(round(a_tuple[2], 4))
        recall_std.append(round(a_tuple[3], 4))
        fscore_std.append(round(a_tuple[4], 4))
    return xs, y_recall, y_fscore, recall_std, fscore_std
def plot_sequential_progress_of_feature_addition_scores(
    xs,
    y_recall,
    y_fscore,
    recall_std,
    fscore_std,
    title="Sequential addition of features and its F1, and recall scores",
):
    """
    Plot the sequential progress of feature-addition scores in two subplots.

    The top subplot shows mean recall (red) with dotted +/- 1 SD bands; the
    bottom one shows mean F1-score with its SD bands. The index of the maximum
    of each series is marked with a black dot. The figure is 18.5 x 10.5
    inches, x-axis labels are rotated by 90 degrees, and the plot is shown
    with plt.show().
    """
    fig, ax = plt.subplots(nrows=2, sharex=True)

    ax[0].plot(xs, np.array(y_recall) + np.array(recall_std), linestyle=":", color="m")  # Upper SD
    ax[0].plot(xs, y_recall, color="red")
    ax[0].plot(xs, np.array(y_recall) - np.array(recall_std), linestyle=":", color="m")  # Lower SD
    mrec_indx = np.argmax(y_recall)
    ax[0].plot(xs[mrec_indx], y_recall[mrec_indx], "-o", color="black")
    ax[0].legend(["Upper std", "Mean Recall", "Lower std"])

    ax[1].plot(xs, np.array(y_fscore) + np.array(fscore_std), linestyle=":", color="c")  # Upper SD
    ax[1].plot(xs, y_fscore)
    ax[1].plot(xs, np.array(y_fscore) - np.array(fscore_std), linestyle=":", color="c")  # Lower SD
    mfscore_indx = np.argmax(y_fscore)
    ax[1].plot(xs[mfscore_indx], y_fscore[mfscore_indx], "-o", color="black")
    ax[1].legend(["Upper std", "Mean F1-score", "Lower std"])

    fig.set_size_inches(18.5, 10.5)
    ax[0].title.set_text("Recall scores")
    ax[1].title.set_text("F1-scores")
    plt.suptitle(title, fontsize=14)
    plt.xticks(rotation=90)
    plt.show()
# %%
sensors_features_groups = [
    "empatica_inter_beat_",
    "empatica_accelerometer_",
    "empatica_temperature_",
    "empatica_electrodermal_",
    "phone_activity_",
    "phone_applications_",
    "phone_bluetooth_",
    "phone_battery_",
    "phone_calls_",
    "phone_light_",
    "phone_locations_",
    "phone_messages",
    "phone_screen_",
]  # , "phone_speech_"]

# sensors_features_groups = ["phone_", "empatica_", "demo_"]

# %%
# sensor_importance_scores = find_sensor_group_features_importance(model_input, big_groups_substr)
sensor_groups_importance_scores = find_sensor_group_features_importance(
    model_input, sensors_features_groups
)
xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(
    sensor_groups_importance_scores
)

# %% [markdown]
# ### Visualize sensors groups F1 and recall scores
print(sensor_groups_importance_scores)
plot_sequential_progress_of_feature_addition_scores(
    xs,
    y_recall,
    y_fscore,
    recall_std,
    fscore_std,
    title="Sequential addition of sensors and its F1, and recall scores",
)
# %%
# Take the most important feature group and investigate it feature-by-feature
best_sensor_group = sensor_groups_importance_scores[0][0]  # take the highest rated sensor group
best_sensor_features = [col for col in model_input if col.startswith(best_sensor_group)]
# best_sensor_features_scores = find_sensor_group_features_importance(model_input, best_sensor_features)

# xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(best_sensor_features_scores)

# %% [markdown]
# ### Visualize best sensor's F1 and recall scores
# print(best_sensor_features_scores)
# plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
#                                                     title="Best sensor addition it's features with F1 and recall scores")
# %%
# Iterate over all sensor groups and investigate sequential feature
# importance feature-by-feature. The score sequences for all sensors'
# features are also saved to an Excel file.
seq_columns = ["sensor_name", "feature_sequence", "recall", "f1_score"]
feature_sequence = pd.DataFrame(columns=seq_columns)
for sensor_group in sensor_groups_importance_scores:
    current_sensor_features = [
        col for col in model_input if col.startswith(sensor_group[0])
    ]
    current_sensor_features_scores = find_sensor_group_features_importance(
        model_input, current_sensor_features
    )
    xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(
        current_sensor_features_scores
    )
    # Lists are wrapped in a one-element list so each group becomes a single
    # DataFrame row holding the whole sequence.
    feature_sequence = pd.concat(
        [
            feature_sequence,
            pd.DataFrame(
                {
                    "sensor_name": sensor_group[0],
                    "feature_sequence": [xs],
                    "recall": [y_recall],
                    "f1_score": [y_fscore],
                    "recall_std": [recall_std],
                    "f1_std": [fscore_std],
                }
            ),
        ]
    )

    plot_sequential_progress_of_feature_addition_scores(
        xs,
        y_recall,
        y_fscore,
        recall_std,
        fscore_std,
        title=f"Sequential addition of features for {sensor_group[0]} and its F1, and recall scores",
    )

feature_sequence.to_excel("all_sensors_sequential_addition_scores.xlsx", index=False)
# %%
# TODO: method that reads data from the excel file, specified above, and then the method,
# that selects only features that are max a thresh[%] below the max value (best for recall
# possibly for f1). This method should additionally take threshold parameter.
# %%