Debugging of the empatica data yield integration.

2022-09-27 09:54:15 +00:00 · 2022-09-27 09:54:15 +00:00 · f0b87c9dd0
parent 7fcdb873fe
commit f0b87c9dd0
4 changed files with 12 additions and 14 deletions
--- a/src/features/all_cleaning_individual/straw/__init__.py
+++ b/src/features/all_cleaning_individual/straw/__init__.py
--- a/src/features/all_cleaning_individual/straw/main.py
+++ b/src/features/all_cleaning_individual/straw/main.py
@@ -8,6 +8,7 @@ from sklearn.preprocessing import StandardScaler
 import matplotlib.pyplot as plt
 import seaborn as sns

+sys.path.append('/rapids/')
 from src.features import empatica_data_yield as edy

 def straw_cleaning(sensor_data_files, provider):
@@ -22,14 +23,14 @@ def straw_cleaning(sensor_data_files, provider):
    excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']

    # (1) FILTER_OUT THE ROWS THAT DO NOT HAVE THE TARGET COLUMN AVAILABLE
-    if config['PARAMS_FOR_ANALYSIS']['TARGET']['COMPUTE']:
-        target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL'] # get target label from config
-        features = features[features['phone_esm_straw_' + target].notna()].reset_index(drop=True)
+    # if config['PARAMS_FOR_ANALYSIS']['TARGET']['COMPUTE']:
+    #     target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL'] # get target label from config
+    #     features = features[features['phone_esm_straw_' + target].notna()].reset_index(drop=True)

    # (2) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (should be <= if even all NaN columns must be preserved - this solution now drops columns with all NaN rows)
    features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]

-    # (3.1) QUALITY CHECK (DATA YIELD COLUMN) which determines if the row stays or not (if either E4 or phone is low quality the row is useless - TODO: determine threshold)
+    # (3.1) QUALITY CHECK (DATA YIELD COLUMN) deletes the rows where E4 or phone data is low quality
    phone_data_yield_unit = provider["PHONE_DATA_YIELD_FEATURE"].split("_")[3].lower()
    phone_data_yield_column = "phone_data_yield_rapids_ratiovalidyielded" + phone_data_yield_unit

@@ -111,12 +112,11 @@ def straw_cleaning(sensor_data_files, provider):
    if features.isna().any().any():
        raise ValueError

-    sys.exit()
-
    return features

 def graph_bf_af(features, phase_name):
    sns.set(rc={"figure.figsize":(16, 8)})
+    print(features)
    sns.heatmap(features.isna(), cbar=False)
    plt.savefig(f'features_nans_{phase_name}.png', bbox_inches='tight')

--- a/src/features/all_cleaning_overall/straw/__init__.py
+++ b/src/features/all_cleaning_overall/straw/__init__.py
--- a/src/features/empatica_data_yield.py
+++ b/src/features/empatica_data_yield.py
@@ -6,8 +6,8 @@ import sys

 def calculate_empatica_data_yield(features):
    # Get time segment duration in seconds from dataframe
-    datetime_start = datetime.strptime(df.loc[0, 'local_segment_start_datetime'], '%y-%m-%d %H:%M:%S')
-    datetime_end = datetime.strptime(df.loc[0, 'local_segment_end_datetime'], '%y-%m-%d %H:%M:%S')
+    datetime_start = datetime.strptime(features.loc[0, 'local_segment_start_datetime'], '%Y-%m-%d %H:%M:%S')
+    datetime_end = datetime.strptime(features.loc[0, 'local_segment_end_datetime'], '%Y-%m-%d %H:%M:%S')
    tseg_duration = (datetime_end - datetime_start).total_seconds()

    features["acc_data_yield"] = (features['empatica_accelerometer_cr_SO_windowsCount'] * 15) / tseg_duration
@@ -15,12 +15,10 @@ def calculate_empatica_data_yield(features):
    features["eda_data_yield"] = (features['empatica_electrodermal_activity_cr_SO_windowsCount'] * 60) / tseg_duration
    features["ibi_data_yield"] = (features['empatica_inter_beat_interval_cr_SO_windowsCount'] * 300) / tseg_duration

-    features["empatica_data_yield"] = features[['acc_data_yield', 'temp_data_yield', 'eda_data_yield', 'ibi_data_yield']].mean(axis=1)
-
-    # TODO: morda smisleno obdelovati različne senzorje ločeno -> lahko da ibi ne bo dobre kvalitete, ostali pa bodo okej. Zakaj bi samo zaradi IBI zavrgli celotno vrstico ...
-    # lahko se tudi naredi overall kvaliteta empatice npr. povprečje vseh data_yield rezultatov? Oz. povprečje z utežmi glede na število stolpcev, ki jih senzor vsebuje
-    # ... čeprav št. stolpcev ni najboljše, saj je pomembnost nekaterih (npr. EDA) značilk zelo vprašljiva.  
-    # TODO: boljša nastavitev delovnih ur sedaj je od 4 do 4... to povzroči veliko manjkajočih podatkov in posledično nizek (telefonski in E4) data_yield ... 
+    # TODO: boljša nastavitev delovnih ur sedaj je od 4:00 do 4:00... to povzroči veliko manjkajočih podatkov in posledično nizek (telefonski in E4) data_yield ... 
+    empatica_data_yield_cols = ['acc_data_yield', 'temp_data_yield', 'eda_data_yield', 'ibi_data_yield']
+    features["empatica_data_yield"] = features[empatica_data_yield_cols].mean(axis=1)
+    features.drop(empatica_data_yield_cols, axis=1, inplace=True) # In case of if the advanced operations will later not be needed (e.g., weighted average)

    return features