Debugging of the empatica data yield integration.
parent
7fcdb873fe
commit
f0b87c9dd0
|
@ -8,6 +8,7 @@ from sklearn.preprocessing import StandardScaler
|
|||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
|
||||
sys.path.append('/rapids/')
|
||||
from src.features import empatica_data_yield as edy
|
||||
|
||||
def straw_cleaning(sensor_data_files, provider):
|
||||
|
@ -22,14 +23,14 @@ def straw_cleaning(sensor_data_files, provider):
|
|||
excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']
|
||||
|
||||
# (1) FILTER_OUT THE ROWS THAT DO NOT HAVE THE TARGET COLUMN AVAILABLE
|
||||
if config['PARAMS_FOR_ANALYSIS']['TARGET']['COMPUTE']:
|
||||
target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL'] # get target label from config
|
||||
features = features[features['phone_esm_straw_' + target].notna()].reset_index(drop=True)
|
||||
# if config['PARAMS_FOR_ANALYSIS']['TARGET']['COMPUTE']:
|
||||
# target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL'] # get target label from config
|
||||
# features = features[features['phone_esm_straw_' + target].notna()].reset_index(drop=True)
|
||||
|
||||
# (2) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (use <= instead of < if even all-NaN columns must be preserved - with the strict < used here, columns whose rows are all NaN are dropped)
|
||||
features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]
|
||||
|
||||
# (3.1) QUALITY CHECK (DATA YIELD COLUMN) which determines if the row stays or not (if either E4 or phone is low quality the row is useless - TODO: determine threshold)
|
||||
# (3.1) QUALITY CHECK (DATA YIELD COLUMN) deletes the rows where E4 or phone data is low quality
|
||||
phone_data_yield_unit = provider["PHONE_DATA_YIELD_FEATURE"].split("_")[3].lower()
|
||||
phone_data_yield_column = "phone_data_yield_rapids_ratiovalidyielded" + phone_data_yield_unit
|
||||
|
||||
|
@ -111,12 +112,11 @@ def straw_cleaning(sensor_data_files, provider):
|
|||
if features.isna().any().any():
|
||||
raise ValueError
|
||||
|
||||
sys.exit()
|
||||
|
||||
return features
|
||||
|
||||
def graph_bf_af(features, phase_name):
|
||||
sns.set(rc={"figure.figsize":(16, 8)})
|
||||
print(features)
|
||||
sns.heatmap(features.isna(), cbar=False)
|
||||
plt.savefig(f'features_nans_{phase_name}.png', bbox_inches='tight')
|
||||
|
||||
|
|
|
@ -6,8 +6,8 @@ import sys
|
|||
|
||||
def calculate_empatica_data_yield(features):
|
||||
# Get time segment duration in seconds from dataframe
|
||||
datetime_start = datetime.strptime(df.loc[0, 'local_segment_start_datetime'], '%y-%m-%d %H:%M:%S')
|
||||
datetime_end = datetime.strptime(df.loc[0, 'local_segment_end_datetime'], '%y-%m-%d %H:%M:%S')
|
||||
datetime_start = datetime.strptime(features.loc[0, 'local_segment_start_datetime'], '%Y-%m-%d %H:%M:%S')
|
||||
datetime_end = datetime.strptime(features.loc[0, 'local_segment_end_datetime'], '%Y-%m-%d %H:%M:%S')
|
||||
tseg_duration = (datetime_end - datetime_start).total_seconds()
|
||||
|
||||
features["acc_data_yield"] = (features['empatica_accelerometer_cr_SO_windowsCount'] * 15) / tseg_duration
|
||||
|
@ -15,12 +15,10 @@ def calculate_empatica_data_yield(features):
|
|||
features["eda_data_yield"] = (features['empatica_electrodermal_activity_cr_SO_windowsCount'] * 60) / tseg_duration
|
||||
features["ibi_data_yield"] = (features['empatica_inter_beat_interval_cr_SO_windowsCount'] * 300) / tseg_duration
|
||||
|
||||
features["empatica_data_yield"] = features[['acc_data_yield', 'temp_data_yield', 'eda_data_yield', 'ibi_data_yield']].mean(axis=1)
|
||||
|
||||
# TODO: it may make sense to process the different sensors separately -> the IBI may be of poor quality while the others are fine. Why discard the whole row just because of IBI ...
|
||||
# an overall Empatica quality could also be computed, e.g. the mean of all data_yield results? Or a mean weighted by the number of columns that each sensor contains
|
||||
# ... although the number of columns is not the best weight, since the importance of some features (e.g. EDA) is highly questionable.
|
||||
# TODO: better setting of the working hours; currently it is from 4 to 4... this causes a lot of missing data and consequently a low (phone and E4) data_yield ...
|
||||
# TODO: better setting of the working hours; currently it is from 4:00 to 4:00... this causes a lot of missing data and consequently a low (phone and E4) data_yield ...
|
||||
empatica_data_yield_cols = ['acc_data_yield', 'temp_data_yield', 'eda_data_yield', 'ibi_data_yield']
|
||||
features["empatica_data_yield"] = features[empatica_data_yield_cols].mean(axis=1)
|
||||
features.drop(empatica_data_yield_cols, axis=1, inplace=True) # In case the advanced operations (e.g., weighted average) will not be needed later
|
||||
|
||||
return features
|
||||
|
||||
|
|
Loading…
Reference in New Issue