Debugging of the empatica data yield integration.

notes
Primoz 2022-09-27 09:54:15 +00:00
parent 7fcdb873fe
commit f0b87c9dd0
4 changed files with 12 additions and 14 deletions

View File

@ -8,6 +8,7 @@ from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import seaborn as sns import seaborn as sns
sys.path.append('/rapids/')
from src.features import empatica_data_yield as edy from src.features import empatica_data_yield as edy
def straw_cleaning(sensor_data_files, provider): def straw_cleaning(sensor_data_files, provider):
@ -22,14 +23,14 @@ def straw_cleaning(sensor_data_files, provider):
excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime'] excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']
# (1) FILTER_OUT THE ROWS THAT DO NOT HAVE THE TARGET COLUMN AVAILABLE # (1) FILTER_OUT THE ROWS THAT DO NOT HAVE THE TARGET COLUMN AVAILABLE
if config['PARAMS_FOR_ANALYSIS']['TARGET']['COMPUTE']: # if config['PARAMS_FOR_ANALYSIS']['TARGET']['COMPUTE']:
target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL'] # get target label from config # target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL'] # get target label from config
features = features[features['phone_esm_straw_' + target].notna()].reset_index(drop=True) # features = features[features['phone_esm_straw_' + target].notna()].reset_index(drop=True)
# (2) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (should be <= if even all NaN columns must be preserved - this solution now drops columns with all NaN rows) # (2) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (should be <= if even all NaN columns must be preserved - this solution now drops columns with all NaN rows)
features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]] features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]
# (3.1) QUALITY CHECK (DATA YIELD COLUMN) which determines if the row stays or not (if either E4 or phone is low quality the row is useless - TODO: determine threshold) # (3.1) QUALITY CHECK (DATA YIELD COLUMN) deletes the rows where E4 or phone data is low quality
phone_data_yield_unit = provider["PHONE_DATA_YIELD_FEATURE"].split("_")[3].lower() phone_data_yield_unit = provider["PHONE_DATA_YIELD_FEATURE"].split("_")[3].lower()
phone_data_yield_column = "phone_data_yield_rapids_ratiovalidyielded" + phone_data_yield_unit phone_data_yield_column = "phone_data_yield_rapids_ratiovalidyielded" + phone_data_yield_unit
@ -111,12 +112,11 @@ def straw_cleaning(sensor_data_files, provider):
if features.isna().any().any(): if features.isna().any().any():
raise ValueError raise ValueError
sys.exit()
return features return features
def graph_bf_af(features, phase_name): def graph_bf_af(features, phase_name):
sns.set(rc={"figure.figsize":(16, 8)}) sns.set(rc={"figure.figsize":(16, 8)})
print(features)
sns.heatmap(features.isna(), cbar=False) sns.heatmap(features.isna(), cbar=False)
plt.savefig(f'features_nans_{phase_name}.png', bbox_inches='tight') plt.savefig(f'features_nans_{phase_name}.png', bbox_inches='tight')

View File

@ -6,8 +6,8 @@ import sys
def calculate_empatica_data_yield(features): def calculate_empatica_data_yield(features):
# Get time segment duration in seconds from dataframe # Get time segment duration in seconds from dataframe
datetime_start = datetime.strptime(df.loc[0, 'local_segment_start_datetime'], '%y-%m-%d %H:%M:%S') datetime_start = datetime.strptime(features.loc[0, 'local_segment_start_datetime'], '%Y-%m-%d %H:%M:%S')
datetime_end = datetime.strptime(df.loc[0, 'local_segment_end_datetime'], '%y-%m-%d %H:%M:%S') datetime_end = datetime.strptime(features.loc[0, 'local_segment_end_datetime'], '%Y-%m-%d %H:%M:%S')
tseg_duration = (datetime_end - datetime_start).total_seconds() tseg_duration = (datetime_end - datetime_start).total_seconds()
features["acc_data_yield"] = (features['empatica_accelerometer_cr_SO_windowsCount'] * 15) / tseg_duration features["acc_data_yield"] = (features['empatica_accelerometer_cr_SO_windowsCount'] * 15) / tseg_duration
@ -15,12 +15,10 @@ def calculate_empatica_data_yield(features):
features["eda_data_yield"] = (features['empatica_electrodermal_activity_cr_SO_windowsCount'] * 60) / tseg_duration features["eda_data_yield"] = (features['empatica_electrodermal_activity_cr_SO_windowsCount'] * 60) / tseg_duration
features["ibi_data_yield"] = (features['empatica_inter_beat_interval_cr_SO_windowsCount'] * 300) / tseg_duration features["ibi_data_yield"] = (features['empatica_inter_beat_interval_cr_SO_windowsCount'] * 300) / tseg_duration
features["empatica_data_yield"] = features[['acc_data_yield', 'temp_data_yield', 'eda_data_yield', 'ibi_data_yield']].mean(axis=1) # TODO: boljša nastavitev delovnih ur sedaj je od 4:00 do 4:00... to povzroči veliko manjkajočih podatkov in posledično nizek (telefonski in E4) data_yield ...
empatica_data_yield_cols = ['acc_data_yield', 'temp_data_yield', 'eda_data_yield', 'ibi_data_yield']
# TODO: morda smisleno obdelovati različne senzorje ločeno -> lahko da ibi ne bo dobre kvalitete, ostali pa bodo okej. Zakaj bi samo zaradi IBI zavrgli celotno vrstico ... features["empatica_data_yield"] = features[empatica_data_yield_cols].mean(axis=1)
# lahko se tudi naredi overall kvaliteta empatice npr. povprečje vseh data_yield rezultatov? Oz. povprečje z utežmi glede na število stolpcev, ki jih senzor vsebuje features.drop(empatica_data_yield_cols, axis=1, inplace=True) # In case the advanced operations (e.g., weighted average) will not be needed later
# ... čeprav št. stolpcev ni najboljše, saj je pomembnost nekaterih (npr. EDA) značilk zelo vprašljiva.
# TODO: boljša nastavitev delovnih ur sedaj je od 4 do 4... to povzroči veliko manjkajočih podatkov in posledično nizek (telefonski in E4) data_yield ...
return features return features