Debugging of the empatica data yield integration.
parent
7fcdb873fe
commit
f0b87c9dd0
|
@ -8,6 +8,7 @@ from sklearn.preprocessing import StandardScaler
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import seaborn as sns
|
import seaborn as sns
|
||||||
|
|
||||||
|
sys.path.append('/rapids/')
|
||||||
from src.features import empatica_data_yield as edy
|
from src.features import empatica_data_yield as edy
|
||||||
|
|
||||||
def straw_cleaning(sensor_data_files, provider):
|
def straw_cleaning(sensor_data_files, provider):
|
||||||
|
@ -22,14 +23,14 @@ def straw_cleaning(sensor_data_files, provider):
|
||||||
excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']
|
excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']
|
||||||
|
|
||||||
# (1) FILTER_OUT THE ROWS THAT DO NOT HAVE THE TARGET COLUMN AVAILABLE
|
# (1) FILTER_OUT THE ROWS THAT DO NOT HAVE THE TARGET COLUMN AVAILABLE
|
||||||
if config['PARAMS_FOR_ANALYSIS']['TARGET']['COMPUTE']:
|
# if config['PARAMS_FOR_ANALYSIS']['TARGET']['COMPUTE']:
|
||||||
target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL'] # get target label from config
|
# target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL'] # get target label from config
|
||||||
features = features[features['phone_esm_straw_' + target].notna()].reset_index(drop=True)
|
# features = features[features['phone_esm_straw_' + target].notna()].reset_index(drop=True)
|
||||||
|
|
||||||
# (2) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (should be <= if even all NaN columns must be preserved - this solution now drops columns with all NaN rows)
|
# (2) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (should be <= if even all NaN columns must be preserved - this solution now drops columns with all NaN rows)
|
||||||
features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]
|
features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]
|
||||||
|
|
||||||
# (3.1) QUALITY CHECK (DATA YIELD COLUMN) which determines if the row stays or not (if either E4 or phone is low quality the row is useless - TODO: determine threshold)
|
# (3.1) QUALITY CHECK (DATA YIELD COLUMN) deletes the rows where E4 or phone data is low quality
|
||||||
phone_data_yield_unit = provider["PHONE_DATA_YIELD_FEATURE"].split("_")[3].lower()
|
phone_data_yield_unit = provider["PHONE_DATA_YIELD_FEATURE"].split("_")[3].lower()
|
||||||
phone_data_yield_column = "phone_data_yield_rapids_ratiovalidyielded" + phone_data_yield_unit
|
phone_data_yield_column = "phone_data_yield_rapids_ratiovalidyielded" + phone_data_yield_unit
|
||||||
|
|
||||||
|
@ -111,12 +112,11 @@ def straw_cleaning(sensor_data_files, provider):
|
||||||
if features.isna().any().any():
|
if features.isna().any().any():
|
||||||
raise ValueError
|
raise ValueError
|
||||||
|
|
||||||
sys.exit()
|
|
||||||
|
|
||||||
return features
|
return features
|
||||||
|
|
||||||
def graph_bf_af(features, phase_name):
|
def graph_bf_af(features, phase_name):
|
||||||
sns.set(rc={"figure.figsize":(16, 8)})
|
sns.set(rc={"figure.figsize":(16, 8)})
|
||||||
|
print(features)
|
||||||
sns.heatmap(features.isna(), cbar=False)
|
sns.heatmap(features.isna(), cbar=False)
|
||||||
plt.savefig(f'features_nans_{phase_name}.png', bbox_inches='tight')
|
plt.savefig(f'features_nans_{phase_name}.png', bbox_inches='tight')
|
||||||
|
|
||||||
|
|
|
@ -6,8 +6,8 @@ import sys
|
||||||
|
|
||||||
def calculate_empatica_data_yield(features):
|
def calculate_empatica_data_yield(features):
|
||||||
# Get time segment duration in seconds from dataframe
|
# Get time segment duration in seconds from dataframe
|
||||||
datetime_start = datetime.strptime(df.loc[0, 'local_segment_start_datetime'], '%y-%m-%d %H:%M:%S')
|
datetime_start = datetime.strptime(features.loc[0, 'local_segment_start_datetime'], '%Y-%m-%d %H:%M:%S')
|
||||||
datetime_end = datetime.strptime(df.loc[0, 'local_segment_end_datetime'], '%y-%m-%d %H:%M:%S')
|
datetime_end = datetime.strptime(features.loc[0, 'local_segment_end_datetime'], '%Y-%m-%d %H:%M:%S')
|
||||||
tseg_duration = (datetime_end - datetime_start).total_seconds()
|
tseg_duration = (datetime_end - datetime_start).total_seconds()
|
||||||
|
|
||||||
features["acc_data_yield"] = (features['empatica_accelerometer_cr_SO_windowsCount'] * 15) / tseg_duration
|
features["acc_data_yield"] = (features['empatica_accelerometer_cr_SO_windowsCount'] * 15) / tseg_duration
|
||||||
|
@ -15,12 +15,10 @@ def calculate_empatica_data_yield(features):
|
||||||
features["eda_data_yield"] = (features['empatica_electrodermal_activity_cr_SO_windowsCount'] * 60) / tseg_duration
|
features["eda_data_yield"] = (features['empatica_electrodermal_activity_cr_SO_windowsCount'] * 60) / tseg_duration
|
||||||
features["ibi_data_yield"] = (features['empatica_inter_beat_interval_cr_SO_windowsCount'] * 300) / tseg_duration
|
features["ibi_data_yield"] = (features['empatica_inter_beat_interval_cr_SO_windowsCount'] * 300) / tseg_duration
|
||||||
|
|
||||||
features["empatica_data_yield"] = features[['acc_data_yield', 'temp_data_yield', 'eda_data_yield', 'ibi_data_yield']].mean(axis=1)
|
# TODO: set the working hours better; they currently run from 4:00 to 4:00... which causes a lot of missing data and consequently a low (phone and E4) data_yield ...
|
||||||
|
empatica_data_yield_cols = ['acc_data_yield', 'temp_data_yield', 'eda_data_yield', 'ibi_data_yield']
|
||||||
# TODO: it may make sense to process the different sensors separately -> IBI may be of poor quality while the others are fine. Why discard the whole row just because of IBI ...
|
features["empatica_data_yield"] = features[empatica_data_yield_cols].mean(axis=1)
|
||||||
# an overall Empatica quality score could also be computed, e.g. the mean of all data_yield results? Or a mean weighted by the number of columns each sensor contributes
|
features.drop(empatica_data_yield_cols, axis=1, inplace=True) # In case the advanced operations (e.g., weighted average) are not needed later
|
||||||
# ... although the number of columns is not the best weight, since the importance of some features (e.g. EDA) is very questionable.
|
|
||||||
# TODO: set the working hours better; they currently run from 4 to 4... which causes a lot of missing data and consequently a low (phone and E4) data_yield ...
|
|
||||||
|
|
||||||
return features
|
return features
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue